# User:Panoramio upload bot/code
# (source page: Wikimedia Commons, the free media repository)
#!/usr/bin/python
# -*- coding: utf-8  -*-
"""
Script to upload images of panoramio to wikimedia commons.

by Shizhao 2014
 
"""
import urllib2,re, random
import datetime
from BeautifulSoup import BeautifulSoup
import upload,time, StringIO, hashlib, base64
import tempfile
import pywikibot
import pywikibot.data.api
from pywikibot import config
import sys    
from multiprocessing.dummy import Pool as ThreadPool 
reload(sys) # Python 2.5+ removes sys.setdefaultencoding after start-up; reload sys to restore it
sys.setdefaultencoding('utf-8')
site = pywikibot.Site(u'commons', u'commons')

def cleanUpTitle(title, site, author, project = u'panoramio'):
    ''' Clean up the title of a potential mediawiki page. Otherwise the title of
    the page might not be allowed by the software.

    Adapted from flickrripper.py.

    Returns a filename (without the 'File:' namespace prefix) of the form
        '<title> - <project>.jpg'
        '<title> - <project> - <author>.jpg'
        '<title> - <project> - <author> (<n>).jpg'
    picking the first variant that does not already exist on the wiki.
    '''
    # MediaWiki titles are limited to 255 bytes; leave room for the
    # ' - project - author (n).jpg' suffix added below.
    maxBytes = 240 - len(project.encode('utf-8')) \
                    - len(author.encode('utf-8'))
    titleBytes = len(title.encode('utf-8'))
    if titleBytes > maxBytes:
        # maybe we cut more than needed, anyway we do it
        items = max(min(len(title), maxBytes / 4),
                    len(title) - titleBytes + maxBytes)
        title = title[:items]
    title = title.strip()
    # BUGFIX: the old pattern u"^ $" could never match after strip(), so
    # empty titles stayed empty; now they really become "Untitled".
    title = re.sub(u"^$", u"Untitled", title)
    title = re.sub(u"[<{\\[]", u"(", title)
    title = re.sub(u"[>}\\]]", u")", title)
    title = re.sub(u"[ _]?\\(!\\)", u"", title)
    title = re.sub(u",:[ _]", u", ", title)
    title = re.sub(u"[;:][ _]", u", ", title)
    title = re.sub(u"[\t\n ]+", u" ", title)
    title = re.sub(u"[\r\n ]+", u" ", title)
    title = re.sub(u"[\n]+", u"", title)
    title = re.sub(u"[?!]([.\"]|$)", u"\\1", title)
    title = re.sub(u"[&#%?!]", u"^", title)
    title = re.sub(u"[;]", u",", title)
    title = re.sub(u"[/+\\\\:]", u"-", title)
    title = re.sub(u"--+", u"-", title)
    title = re.sub(u",,+", u",", title)
    title = re.sub(u"[-,^]([.]|$)", u"\\1", title)
    title = title.replace(u" ", u"_")
    if not site:
        site = pywikibot.Site(u'commons', u'commons')
    try:
        if pywikibot.Page(site, u'File:%s - %s.jpg' % (title, project)).exists():
            if pywikibot.Page(site, u'File:%s - %s - %s.jpg' % (title, project, author)).exists():
                # both short names are taken: find the first free (n) suffix
                i = 1
                while pywikibot.Page(site, u'File:%s - %s - %s (%d).jpg' % (title, project, author, i)).exists():
                    i += 1
                return u'%s - %s - %s (%d).jpg' % (title, project, author, i)
            else:
                return u'%s - %s - %s.jpg' % (title, project, author)
        else:
            # BUGFIX: do not prepend the 'File:' namespace here; the other
            # branches return a bare filename and the result is passed to
            # UploadRobot as useFilename.
            return u'%s - %s.jpg' % (title, project)
    except AttributeError:
        return u'%s - %s.jpg' % (title, project)

def checkcountry(cats):
    '''Return True when any of the given parent categories is a country
    category (its title contains u'Countries'), so the caller can drop it.

    cats: iterable of category page objects exposing title(withNamespace=...).

    BUGFIX: the original raised UnboundLocalError when `cats` was empty;
    an empty iterable now simply returns False.
    '''
    for countrycat in cats:
        if u'Countries' in countrycat.title(withNamespace=False):
            return True
    return False
    
def checkyear(category):
    '''Return True when the category name is purely numeric, i.e. looks
    like a bare year category such as "2014" that should be dropped.
    '''
    return category.isdigit()

def checkmetacategories(cats):
    '''Return True when any parent category marks this as a meta category
    (its title contains u'Meta categories' or is exactly u'Topics'), so
    the caller can drop it.

    cats: iterable of category page objects exposing title(withNamespace=...).

    BUGFIX: the original combined the two tests with a wrong De Morgan
    (`not in ... or ... <> ...`), which made the True branch unreachable,
    and it raised UnboundLocalError for an empty iterable.  It also used
    the Python-2-only `<>` operator.
    '''
    for metacat in cats:
        catname = metacat.title(withNamespace=False)
        if u'Meta categories' in catname or catname == u'Topics':
            return True
    return False

def checkmaincategory(categorypage):
    '''Return True when the category page carries one of the maintenance
    templates (MetaCat, CatDiffuse, Categorise, CatCat, Disambig) that
    mark it as unsuitable for directly categorising files.

    categorypage: a page object exposing templates().

    BUGFIX: the original raised UnboundLocalError when the page had no
    templates at all; that case now returns False.
    '''
    maincattemp = [u'MetaCat', u'CatDiffuse', u'Categorise', u'CatCat', u'Disambig']
    for temp in categorypage.templates():
        if temp.title(withNamespace=False) in maincattemp:
            return True
    return False
        
def buildDescription(Information, site, tags, Location):
    ''' Build the final description for the image.

    Information: the {{Information}} wikitext block already assembled.
    site: the Commons site object used to look up category pages.
    tags: list of candidate category names scraped from Panoramio.
    Location: the {{Location}} template text (u'' when no geodata).

    Each tag is checked against existing Commons categories: country,
    year, meta and maintenance categories are dropped, category
    redirects are resolved, and duplicates removed.  If no usable
    category survives, {{subst:unc}} (or {{subst:unc|geo=1}} when
    coordinates exist) is appended instead of {{subst:chc}}.
    '''
    description = Information
    catexists = len(tags)  # count of tags still considered usable
    if tags:
        catstext = u''
        for category in tags:
            try:
                categorypage = pywikibot.Page(site, u'Category:' + category)
                if categorypage.exists():
                    if categorypage.isCategoryRedirect():
                        # follow the redirect and judge the target instead
                        cats = categorypage.getCategoryRedirectTarget().categories()
                        if checkcountry(cats) or checkyear(category) or checkmetacategories(cats) or checkmaincategory(categorypage):
                            catexists = catexists - 1
                            pywikibot.output(u'remove category: ' + category)
                        else:
                            if u'[[Category:' + categorypage.getCategoryRedirectTarget().title(withNamespace=False) + ']]\n' not in catstext:
                                catstext = catstext + u'[[Category:' + categorypage.getCategoryRedirectTarget().title(withNamespace=False) + ']]\n'
                                pywikibot.output(u'RedirectTarget: ' + categorypage.getCategoryRedirectTarget().title())
                            else:
                                # redirect target already collected once
                                catexists = catexists - 1
                                pywikibot.output(u'remove category: ' + category + '. Dupe')
                    else:
                        cats = categorypage.categories()
                        if checkcountry(cats) or checkyear(category) or checkmetacategories(cats) or checkmaincategory(categorypage):
                            catexists = catexists - 1
                            pywikibot.output(u'remove category: ' + category)
                        else:
                            catstext = catstext + u'[[Category:' + category + ']]\n'
                else:
                    catexists = catexists - 1
                    pywikibot.output(u'[[Category:' + category + ']]' + u' not exists.')
            except Exception:
                # BUGFIX: was a bare `except:`; keep the best-effort
                # behaviour (skip the tag on any lookup error) without
                # swallowing KeyboardInterrupt/SystemExit.
                catexists = catexists - 1
        if catexists == 0:
            if Location == u'':
                description = description + u'{{subst:unc}}\n'
            else:
                description = description + u'{{subst:unc|geo=1}}\n'
        else:
            description = description + u'{{subst:chc}}\n\n' + catstext
    else:
        if Location == u'':
            description = description + u'{{subst:unc}}\n'
        else:
            description = description + u'{{subst:unc|geo=1}}\n'

    # tag the upload with the bot account's tracking category
    uploder = config.usernames['commons']['commons']
    description = description + u'[[Category:Panoramio files uploaded by ' + uploder + ']]\n'

    return description
    
def downloadPhoto(photoUrl = ''):
    '''
    Download the photo and return its bytes wrapped in a
    StringIO.StringIO object.

    BUGFIX: the urlopen response is now closed instead of leaked.

    TODO: Add exception handling

    '''
    response = urllib2.urlopen(photoUrl)
    try:
        imageFile = response.read()
    finally:
        response.close()
    return StringIO.StringIO(imageFile)

def findDuplicateImages(photo=None, site=None):
    ''' Takes the photo, calculates the SHA1 hash and asks the mediawiki api
    for a list of duplicates.

    photo: a file-like object exposing getvalue() (e.g. StringIO).
    site: the wiki to query; defaults to Wikimedia Commons.

    BUGFIX: the site default used to be `pywikibot.Site(...)` evaluated
    as a default argument, i.e. at import time, doing config/network work
    on module load and sharing one Site across all calls; it is now
    created lazily on first use.

    TODO: Add exception handling
    '''
    if site is None:
        site = pywikibot.Site(u'commons', u'commons')
    hashObject = hashlib.sha1()
    hashObject.update(photo.getvalue())
    # the API expects the hash in base-16 (uppercase hex)
    return site.getFilesFromAnHash(base64.b16encode(hashObject.digest()))

# --- start-up: read the last processed photo id from the status page ---
start = time.time()
add = 5  # how many photo ids to process in this run
page = pywikibot.Page(site, u"User:Panoramio upload bot/status")
text = page.text
r = int(text)  # first Panoramio photo id to try

# command-line overrides: -start:<id> and -range:<count>
for arg in pywikibot.handleArgs():
    if not arg:
        continue
    if arg.startswith('-start:'):
        r = int(arg[len('-start:'):])
    elif arg.startswith('-range:'):
        add = int(arg[len('-range:'):])

def runputbot(i):
#for i in range(r,r+add):
    site = pywikibot.Site(u'commons', u'commons')
    url="http://www.panoramio.com/photo/%d" % i
    try:
        page = urllib2.urlopen(url)
        urlverify=True
    except (urllib2.HTTPError, urllib2.URLError), e:
        urlverify=False 
        print 'photo id %d' % i, e
        #sec=random.randint(1, 5)
        #pywikibot.output(u"Waiting for %d seconds." % sec)
        #time.sleep(sec)
    while urlverify:
        soup = BeautifulSoup(page)
        license = soup.find('li', attrs={'class' : re.compile("^license")})['class']
        #print license
        
        if license=="license by-sa" or license== "license by":
            pywikibot.output('photo id %d %s is OK! Ready upload Commons...' % (i, license))
            photo_url = "http://static.panoramio.com/photos/original/%d.jpg" % i
            #print photo_url

        #Should download the photo only once
            trying = True
            n=0
            while trying:           
                try:
                    photo = downloadPhoto(photo_url)
                    trying=False 
                except (urllib2.HTTPError,urllib2.URLError), e:
                    pywikibot.output(e.code + ' try....')
                    n=n+1
                    time.sleep(5*n*n)

        #Don't upload duplicate images, should add override option
            duplicates = findDuplicateImages(photo)
            if duplicates:
                pywikibot.output(u'Found duplicate image at %s' % duplicates.pop())
                urlverify=False 
            else:
                      
            #photo title
                title=soup.find("h1", { "id" : "photo-title" }).contents[0].strip().lstrip().rstrip(',')

                #author and author _url
                author=soup.find("a", { "rel" : "author" }).contents[0]
                author_url = soup.find("a", { "rel" : "author" })['href']
                author_url= 'http://www.panoramio.com' + author_url
                
                #clean filename
                filename=cleanUpTitle(title, site, author)
                pywikibot.output(filename,toStdout=True)
                #print str(filename.decode('utf-8'))        
                #tags
                tags=[]
                for tag in soup.findAll(attrs={'id' : re.compile("^tag_element")}):
                    tags.append(tag.a.contents[0].strip().lstrip().rstrip(','))
                try:
                    mapname=soup.find("div", { "id" : "map_info_name" }).a.contents[0]
                    if mapname not in tags:
                        tags.append(mapname)
                    print tags
                except AttributeError:
                    print tags                  
                
                #date
                try: 
                    date=soup.find("li", { "id" : "tech-details" }).findNext('ul').find(text=re.compile("^Taken on"))
                    formatdate=datetime.datetime.strptime(date,'Taken on %Y/%m/%d %X')
                    date = formatdate.strftime('%Y-%m-%d')
                    date=u'{{Taken on|%s}}' % date
                    print date
                except (AttributeError, TypeError):
                    date=soup.find("ul", { "id" : "details" }).li.contents[0].strip().lstrip().rstrip(',')
                    formatdate=datetime.datetime.strptime(date,'Uploaded on %B %d, %Y')
                    date = formatdate.strftime('%Y-%m-%d')
                    date =u'{{Original upload date|%s}}' % date
                    print 'take on not found.', date
                
                #geo
                try:
                    lat =soup.find("abbr", { "class" : "latitude" })['title']
                    lon = soup.find("abbr", { "class" : "longitude" })['title']
                    Location = u'{{Location|%s|%s|source:Panoramio}}' %(lat, lon)
                except TypeError:
                    Location = u''
                if license == "license by-sa":
                    licensetag = u'{{cc-by-sa-3.0|%s}}' % author
                elif license == "license by":
                    licensetag = u'{{cc-by-3.0|%s}}' % author
                reviewer = config.usernames['commons']['commons']  
                review = u'{{Panoramioreview|%s|{{subst:CURRENTYEAR}}-{{subst:CURRENTMONTH}}-{{subst:CURRENTDAY2}}}}' % reviewer
                
                #print soup.find("div", { "id" : "photo-description-formatted" }).contents[0]
                
                try:
                    photo_description = u''.join(unicode(item) for item in soup.find("div", { "id" : "photo-description-formatted" }).contents).strip().lstrip().rstrip(',')
                    if photo_description:
                        title=title+u'\n\n'+ photo_description
                    else:
                        pass
                except AttributeError:
                    pass
                Information=u'{{Information\n|description=%s\n|date=%s\n|source=%s\n|author=[%s %s]\n|permission=%s\n%s\n|other_versions=\n|other_fields=\n}}\n%s\n\n' % (title, date, url, author_url, author, licensetag, review, Location)
                #site = pywikibot.Site(u'commons', u'commons')
                Description = buildDescription(Information, site, tags, Location)
                #pywikibot.output(Description)

                bot = upload.UploadRobot(photo_url,
                                   description=Description,
                                   useFilename=filename,
                                   keepFilename=True,
                                   verifyDescription=False,
                                   ignoreWarning=True,uploadByUrl=True)
                uploadtoo = True
                n=0             
                while uploadtoo:    
                    try: 
                        bot.upload_image(debug=True)
                        urlverify=False
                        uploadtoo = False
                        #sec=random.randint(1, 10)
                        #pywikibot.output(u"Finished upload. Waiting for %d seconds." % sec)
                        #time.sleep(sec)
                    except e:
                        n=n+1
                        print e.code
                        #time.sleep(5*n*n)
        else:
            pywikibot.output('photo id %d %s is invalid! Ignore...' % (i, license))
            #sec=random.randint(1, 5)
            #pywikibot.output(u"Waiting for %d seconds." % sec)
            #time.sleep(sec)
            urlverify=False 
# Make the Pool of workers
pool = ThreadPool(4)
pool.map(runputbot, range(r,r+add))
#close the pool and wait for the work to finish 
pool.close() 
pool.join() 

print "Elapsed Time: %s" % (time.time() - start)	
		
#update upload status
i=r+add
statuspage = pywikibot.Page(site, u"User:Panoramio upload bot/status")
statuspage.text = u'%d' % i
statuspage.save(u"update upload status: %d" % i)