Monday 24 June 2013

Cropping images (batchCropAirlinersnet.py)

For nearly a year, I have been putting off sorting out my Python installation to handle images. My old Mac mini installation was not up to the job, due to a lack of Xcode support, so I have finally bitten the bullet and resorted to solving this image crop problem using Windows on my laptop.

Cropping watermarks is an ongoing problem on Wikimedia Commons. This code can only crop a strip off the bottom of an image, but in the example of the "credit bar" added to images at airliners.net, that is all that is needed. In the short term this will apply to around 11,000 images. In the longer term, I'll probably integrate it into the batch upload routine, as expectations are that 200,000+ images may be available for release.

Python source code



#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
# batchCropAirlinersnet.py
# A one-off tool to crop watermarks off uploads to Commons from airliners.net.
# These are probably the bottom 12 pixels, but this script included a test of
# pixel rgb values to make sure.
#
# PIL must be available in Python for this to work.
#
# Date: 24 June 2013
# Author: Fae, http://j.mp/faewm
# Permissions: CC-BY-SA, please use for public benefit
'''

import wikipedia, upload, sys, config, urllib2, urllib, re, string, time, catlib, pagegenerators, os.path, hashlib, subprocess
from BeautifulSoup import BeautifulSoup
from sys import argv
import collections
from time import sleep
from os import remove
import Image

# Colour markers concatenated into the console error messages in urltry().
# Left empty here so output prints uncoloured; presumably intended to hold
# ANSI escape codes when colour output is wanted — TODO confirm.
Cyan=""
White=""
Yellow=""

# Push a file to Wikimedia Commons via the pywikipedia UploadRobot.
#        filename      = source file, local path or external url
#        pagetitle     = destination file name on Commons
#        desc          = image page wikitext (or edit comment when over-writing)
#        ignoreWarning = suppress upload warnings; use with caution
def up(filename, pagetitle, desc, ignoreWarning):
    commons = wikipedia.getSite('commons', 'commons')
    robot = upload.UploadRobot(
        filename,
        description=desc,
        useFilename=pagetitle,
        keepFilename=True,          # skip double-checking/editing destination filename
        verifyDescription=False,    # skip double-checking/editing description => bot-mode
        targetSite=commons,
        ignoreWarning=ignoreWarning)
    robot.upload_image(debug=True)

# Open a url defensively, created to cope with a very poor internet connection
def urltry(u):
    headers = { 'User-Agent' : 'Mozilla/5.0' } # Spoof header
    countErr=0
    x=''
    while x=='':
            try:
                    req = urllib2.Request(u,None,headers)
                    x = urllib2.urlopen(req)
                    time.sleep(1)
            except:
                    x=''
                    countErr+=1
                    if countErr>300: countErr=300    #    5 minutes between read attempts, eventually
                    print '** ERROR',countErr,'\n ** Failed to read from '+Yellow+u+Cyan+'\n ** Pause for '+str(countErr*1)+' seconds and try again ['+time.strftime("%H:%M:%S")+']',White
                    time.sleep(1*countErr)
    return x

# Read text from url defensively
def htmlreadtry(x,u):
        countErr=0
        r=True
        while r:
                try:
                        return x.read()
                except:
                        x=urltry(u)
                        countErr+=1
                        if countErr>200:
                                p=300
                        else:
                                p=countErr*2
                        print '** ERROR',countErr,'\n ** Failed to read xml/html'
                        if countErr==1:
                                print 'xml ='+str(x)
                                print 'url ='+u
                        print ' ** Pause for '+str(p)+' seconds and try again'
                        time.sleep(p)
                else:
                        r=False
        return

# Optional resume point: first command-line argument is the number of
# category members already processed in an earlier run and to be skipped.
skip=0
if len(argv)>1:
    skip=int(float(argv[1]))    #    Not working on windows?

# Category whose member pages will be examined and cropped.
category=u"Airliners.net photos (watermarked)"
site = wikipedia.getSite('commons', 'commons')
cat = catlib.Category(site,u'Category:'+category)
gen = pagegenerators.CategorizedPageGenerator(cat,recurse=False)
workdir="D:/"    # Scratch USB stick, change to where working directory is
count=0    # pages seen so far, including skipped ones
uploadcount=0    # successful crop-and-upload cycles

for i in gen:
    count+=1
    if count<=skip:
        continue
    # test for watermark template
    html=i.get()
    if html.find("{{watermark}}")==-1:
        print "No watermark template found in",i.title()
        continue
    print count,"Downloading",i.title()
    fn=re.sub("File:","",i.title())    #    Source file name
    fnout=fn[0:-4]+"_crop"+fn[-4:]    #    Cropped file name (locally)
    # discover where full size image is starting with image page title
    api="http://commons.wikimedia.org/w/api.php?action=query&prop=imageinfo&iiprop=url&format=xml&titles=File:"+urllib.quote(fn)
    url=urltry(api)
    xml=htmlreadtry(url,api)
    if xml.find("<ii url")==-1:
        print "Problem finding url from",api
        continue
    source=xml.split('<ii url="')[1].split('"')[0]
    localfile=workdir+fn
    urllib.urlretrieve(source, localfile)    # Dowload image
    # test and crop
    Im=Image.open(localfile)
    pix=Im.load()
    height=Im.size[1]
    width=Im.size[0]
    pxtest=False
    if sum(pix[0,height-12])<30 and sum(pix[0,height-13])>30:
            pxtest=True
    else:
            if sum(pix[0,height-12])<30 and sum(pix[int(width/2),height-12])<30 and sum(pix[int(width/2),height-13])>30:
                    print "Relied on second pixel test"
                    pxtest=True
    if pxtest:
            Imnew=Im.crop((0,0,width,height-12))
            Imnew.save(workdir+fnout,'jpeg',quality=98)    #    quality=99 makes file 40 to 50% larger, 98 only slightly larger or slightly smaller
    else:
            print "Failed?",0,height-12,"=",pix[0,height-12],0,height-13,"=",pix[0,height-13]
            localfile="D:\\"+fn
            remove(localfile)
            print "_"*60
            sleep(10)
            continue
    # upload
    html=re.sub("\{\{watermark\}\}\n*","",html)
    html=re.sub("\n*\[\[Category:Airliners.net photos .watermarked.\]\]","",html)
    comment="Crop bottom 12 pixels to remove watermark ("+str(width)+"x"+str(height-12)+")"
    up(workdir+fnout,"File:"+fn,comment,True)    #    upload image
    wikipedia.setAction("Remove {{watermark}} after image crop")
    i.put(html)    #    upload revised text
    localfile="D:\\"+fn
    remove(localfile)
    uploadcount+=1
    print "Total",uploadcount,"_"*(64-len(str(uploadcount)))
    sleep(30)    #    lag to ensure human oversight