Cropping images (batchCropAirlinersnet.py)
For nearly a year, I have been putting off sorting out my Python installation to handle images. My old Mac mini install was not up to the job, due to a lack of Xcode support, so I have finally bitten the bullet and resorted to solving this image-crop problem using Windows on my laptop. Cropping watermarks is an ongoing problem on Wikimedia Commons; this code can only crop a strip off the image, but in the example of the "credit bar" added to images at airliners.net, that was all that was needed. In the short term this will apply to around 11,000 images. In the longer term, I'll probably integrate it into the batch upload routine, as expectations are that 200,000+ images may be available for release.
Python source code
#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
# batchCropAirlinersnet.py
# A one-off tool to crop watermarks off uploads to Commons from airliners.net.
# These are probably the bottom 12 pixels, but this script included a test of
# pixel rgb values to make sure.
#
# PIL must be available in Python for this to work.
#
# Date: 24 June 2013
# Author: Fae, http://j.mp/faewm
# Permissions: CC-BY-SA, please use for public benefit
'''
import wikipedia, upload, sys, config, urllib2, urllib, re, string, time, catlib, pagegenerators, os.path, hashlib, subprocess
from BeautifulSoup import BeautifulSoup
from sys import argv
import collections
from time import sleep
from os import remove
import Image
# Console colour codes used in urltry()'s error messages. Left as empty
# strings here, so the messages print uncoloured (e.g. on a Windows console
# with no ANSI support); fill in escape codes to enable colour.
Cyan=""
White=""
Yellow=""
# Upload a file to Commons
# filename = source file, local or external url
# pagetitle = Commons file name
# desc = Image page text (or comment if over-writing)
# ignoreWarning = Use with caution
# Upload a file to Commons.
# filename = source file, a local path or an external url
# pagetitle = destination file name on Commons
# desc = image page text (or the comment, when over-writing an existing file)
# ignoreWarning = passed through to the upload robot; use with caution
def up(filename, pagetitle, desc, ignoreWarning):
 # Bot-mode settings: keep the supplied destination filename and skip the
 # interactive double-check/edit of the description.
 commons = wikipedia.getSite('commons', 'commons')
 robot = upload.UploadRobot(
  filename,
  description=desc,
  useFilename=pagetitle,
  keepFilename=True,
  verifyDescription=False,
  targetSite=commons,
  ignoreWarning=ignoreWarning)
 robot.upload_image(debug=True)
# Open a url defensively, created to cope with a very poor internet connection
# Open a url defensively, created to cope with a very poor internet connection.
# u: the URL to fetch.
# Retries forever with a back-off that grows with the error count, capped at
# 300 seconds (5 minutes between attempts, eventually).
# Returns the urllib2 response object (file-like) once a request succeeds.
def urltry(u):
 headers = { 'User-Agent' : 'Mozilla/5.0' } # Spoof header so we look like a browser
 countErr=0
 x=''
 while x=='':
  try:
   req = urllib2.Request(u,None,headers)
   x = urllib2.urlopen(req)
   time.sleep(1) # brief pause to be gentle on the server
  except Exception: # was a bare except:, which also trapped KeyboardInterrupt/SystemExit
   x=''
   countErr+=1
   if countErr>300: countErr=300 # cap the back-off at 5 minutes between read attempts
   print('** ERROR ' + str(countErr) + ' \n ** Failed to read from ' + Yellow + u + Cyan
         + '\n ** Pause for ' + str(countErr) + ' seconds and try again ['
         + time.strftime("%H:%M:%S") + '] ' + White)
   time.sleep(countErr)
 return x
# Read text from url defensively
# Read text from an already-opened url response defensively.
# x: response object as returned by urltry(); u: the URL, used to re-open the
# connection when a read fails part-way through.
# Returns the page content. Retries indefinitely with a back-off of
# 2 seconds per failure, capped at 300 seconds.
# (The original version carried a dead `r` flag and an unreachable
# `else: r=False` branch -- the loop can only ever exit via the return.)
def htmlreadtry(x,u):
 countErr=0
 while True:
  try:
   return x.read()
  except Exception: # narrowed from a bare except:
   x=urltry(u) # connection dropped mid-read; re-open and try again
   countErr+=1
   p = 300 if countErr>200 else countErr*2
   print('** ERROR ' + str(countErr) + ' \n ** Failed to read xml/html')
   if countErr==1:
    # First failure: dump the (re-opened) response object and url once
    print('xml ='+str(x))
    print('url ='+u)
   print(' ** Pause for '+str(p)+' seconds and try again')
   time.sleep(p)
# ---- Main script body: batch-crop watermarked airliners.net uploads ----
# Optional first command-line argument = number of category entries to skip,
# so an interrupted run can be resumed part-way through.
skip=0
if len(argv)>1:
 skip=int(float(argv[1])) # Not working on windows?
category=u"Airliners.net photos (watermarked)"
site = wikipedia.getSite('commons', 'commons')
cat = catlib.Category(site,u'Category:'+category)
# Iterate over every file page in the watermark maintenance category
gen = pagegenerators.CategorizedPageGenerator(cat,recurse=False)
workdir="D:/" # Scratch USB stick, change to where working directory is
count=0        # pages seen so far (including skipped ones)
uploadcount=0  # files successfully cropped and re-uploaded
for i in gen:
 count+=1
 if count<=skip:
  continue
 # test for watermark template; skip pages that have already been cleaned up
 html=i.get()
 if html.find("{{watermark}}")==-1:
  print "No watermark template found in",i.title()
  continue
 print count,"Downloading",i.title()
 fn=re.sub("File:","",i.title()) # Source file name
 fnout=fn[0:-4]+"_crop"+fn[-4:] # Cropped file name (locally); assumes a 3-letter extension such as .jpg
 # discover where full size image is starting with image page title
 api="http://commons.wikimedia.org/w/api.php?action=query&prop=imageinfo&iiprop=url&format=xml&titles=File:"+urllib.quote(fn)
 url=urltry(api)
 xml=htmlreadtry(url,api)
 if xml.find("<ii url")==-1:
  print "Problem finding url from",api
  continue
 # Crude XML scrape: take the url attribute of the first <ii> element
 source=xml.split('<ii url="')[1].split('"')[0]
 localfile=workdir+fn
 urllib.urlretrieve(source, localfile) # Download image
 # test and crop: the airliners.net credit bar is expected to be the
 # bottom 12 pixel rows of the image.
 Im=Image.open(localfile)
 pix=Im.load()
 height=Im.size[1]
 width=Im.size[0]
 pxtest=False
 # Pixel test: the left-corner pixel 12 rows from the bottom must be
 # near-black (RGB sum < 30) while the pixel one row above it is not --
 # i.e. a dark bar exactly 12px high.
 if sum(pix[0,height-12])<30 and sum(pix[0,height-13])>30:
  pxtest=True
 else:
  # Second chance at the horizontal midpoint, in case the image content
  # just above the bar happens to be dark at the left edge.
  if sum(pix[0,height-12])<30 and sum(pix[int(width/2),height-12])<30 and sum(pix[int(width/2),height-13])>30:
   print "Relied on second pixel test"
   pxtest=True
 if pxtest:
  Imnew=Im.crop((0,0,width,height-12)) # drop the bottom 12 rows
  Imnew.save(workdir+fnout,'jpeg',quality=98) # quality=99 makes file 40 to 50% larger, 98 only slightly larger or slightly smaller
 else:
  # Bar not detected: report the sampled pixel values, clean up, move on.
  print "Failed?",0,height-12,"=",pix[0,height-12],0,height-13,"=",pix[0,height-13]
  localfile="D:\\"+fn # NOTE(review): hard-coded drive letter duplicates workdir -- confirm intentional
  remove(localfile)
  print "_"*60
  sleep(10)
  continue
 # upload the cropped file over the original, then update the page text:
 # strip the {{watermark}} template and the maintenance category.
 html=re.sub("\{\{watermark\}\}\n*","",html)
 html=re.sub("\n*\[\[Category:Airliners.net photos .watermarked.\]\]","",html)
 comment="Crop bottom 12 pixels to remove watermark ("+str(width)+"x"+str(height-12)+")"
 up(workdir+fnout,"File:"+fn,comment,True) # upload image
 wikipedia.setAction("Remove {{watermark}} after image crop")
 i.put(html) # upload revised text
 localfile="D:\\"+fn # NOTE(review): hard-coded drive letter duplicates workdir -- confirm intentional
 remove(localfile)
 uploadcount+=1
 print "Total",uploadcount,"_"*(64-len(str(uploadcount)))
 sleep(30) # lag to ensure human oversight