User:Sz-iwbot/tineyecode

From Wikimedia Commons, the free media repository
Jump to: navigation, search
#!/usr/bin/python
# -*- coding: utf-8  -*-
"""
 This script checks the copyright of new images at Commons, using TinEye.

"""
#
# (C) Shizhao, 2009
#
# Distributed under the terms of the MIT license.
#
#

import re,sys,wikipedia,urllib,urllib2, datetime,time,query,string

mysite = wikipedia.getSite()

def FormatPage(html):
    """Cut a fetched web page down to its <title> section and tidy it.

    Keeps the text from the first '<title>' up to (but not including) the
    first '<link', joins the <title>...</title> element onto a single line
    and strips leading spaces/tabs from every following line.

    html   -- the page source as a string.

    Returns the trimmed string.
    Raises ValueError if '<title>' or '<link' cannot be found in html.
    """
    RAW_BEGIN = '<title>'
    RAW_END = '<link'

    iStart = html.find(RAW_BEGIN)
    iEnd = html.find(RAW_END)
    if iStart < 0 or iEnd < 0:
        # The original raised a bare string here; string exceptions are not
        # accepted by Python >= 2.6 (they surface as a TypeError instead), so
        # raise a real exception carrying the same message.
        raise ValueError("not found CONTENTS")
    html = html[iStart:iEnd]

    # Put the whole <title> element on one line so that a pattern anchored
    # at the start of the text can read it.
    def oneline(matchobj):
        return matchobj.group(0).replace('\n', '')
    html = re.sub(r'(?ms)<title>.*?</title>', oneline, html)

    # Remove indentation (spaces/tabs) that follows a line break.
    html = re.sub(r'\n[ \t]+', r'\n', html)
    return html

#Check at tineye
def bot(tineyetext, imagepage):
    """Read the hit count out of a trimmed TinEye page and report it.

    tineyetext -- text that is expected to start with '<title>N ...</title>',
                  where N is the number of similar images.
    imagepage  -- the wiki page object of the image that was searched for.

    When N is 0 only a log line is written.  When N is greater than 0 and
    the image page does not use the "Similarimages" template, an entry is
    appended to [[User:Sz-iwbot/tineye]].  The entry contains the
    module-level variable ``tineyeurl``, which the caller must have set
    before calling this function.

    An AttributeError raised anywhere in that work (for instance when the
    title pattern does not match) is logged as a fetch error.  The function
    always finishes by sleeping for three seconds.
    """
    try:
        hit_count = int(re.search(r'^<title>(\d+)\D+</title>', tineyetext).group(1))
        if hit_count == 0:
            wikipedia.output(u'[[%s]] seems ok, similar images not found in tineye.' % imagepage.title())
        elif hit_count > 0:
            if "Similarimages" in imagepage.templates():
                wikipedia.output(u'[[%s]] have tag on image. pass...' % imagepage.title())
            else:
                report_page = wikipedia.Page(mysite, u'User:Sz-iwbot/tineye')
                report_text = report_page.get()
                new_entry = (u"\n{{User:Sz-iwbot/sd|image= " + imagepage.title()
                             + u" |n= " + str(hit_count)
                             + u" |url= " + tineyeurl + " }}")
                report_text = report_text + new_entry
                wikipedia.output(u'[[%s]] found %s similar images in tineye.' % (imagepage.title(), hit_count))
                report_page.put(report_text, u"Bot: Found similar images in tineye.")
                # Disabled in the original script: additionally tagging the
                # image page itself with
                # {{Similarimages|n=<count>|url=<tineye url>}} using the
                # summary "Bot: Tag template. Found similar images in tineye."
    except AttributeError:
        wikipedia.output(u'ERROR: can\'t fetching images [[%s]]' % imagepage.title())

    # Pause between two searches.
    wikipedia.output(u'...Sleep, waiting 3 s...\n')
    time.sleep(3)



# ---------------------------------------------------------------------------
# Main script: endlessly (1) tidy the report page [[User:Sz-iwbot/tineye]]
# and (2) look up newly uploaded images at TinEye.
# ---------------------------------------------------------------------------

# Only files whose title ends in one of these extensions are looked up.
IMAGE_TITLE_RE = re.compile(r'.*?(?:gif|png|jpg|jpeg)$', re.IGNORECASE)

# Pulls the URL of the picture shown on the rendered file description page.
FULL_IMAGE_RE = re.compile(r'<div class="fullImageLink" id="file"><a href=".*?"><img alt=".*?" src="(?P<url>[^ ]+?)" width')

# Report entries whose image name is empty.
EMPTY_ENTRY_RE = re.compile(r'\n\{\{User:Sz-iwbot\/sd\|image= \|n=.*?\}\}')

# Images using any of these templates are not looked up.
PASS_TEMPLATES = [
    u'Flickrreview', u'PD-old', u'RetouchedPicture', u'PD-1923', u'PD-URAA',
    u'Not-PD-US-URAA', u'PD-old-100', u'PD-old-80', u'PD-old-75',
    u'PD-old-70', u'PD-old-50', u'PD-user-en', u'PD-user-w', u'patent',
    u'anonymous work', 'anonymous-EU', u'PD-anon-1923', u'PD-Art',
    u'PD-retouched-user', u'User:Flickr upload bot/upload',
    u'BotMoveToCommons', u'PD-Art-YorckProject', u'PermissionOTRS',
    u'PD-Coa-Hungary', u'PD-USGov-NASA', u'PD-Polish', u'PD-USGov-USDA-ARS',
    u'PD-RusEmpire', u'PD-USGov', u'LOC-image', u'PD-Bain',
    u'Historical blank world maps', u'Duplicate', u'PD-LT-exempt',
    u'PD-USGov-Military-Army', u'PD-US', u'Location',
]


def _search_tineye(imageurl, imagepage):
    """Look imageurl up at TinEye and hand the trimmed result page to bot().

    imageurl  -- URL of the picture to search for.
    imagepage -- wiki page object of that picture (used for log messages
                 and passed on to bot()).

    An IOError raised while fetching or processing the result page is
    logged and the image is skipped; other exceptions propagate.
    """
    # bot() reads the module-level name `tineyeurl` when it writes a report
    # entry, so the URL has to be published as a global, not kept local.
    global tineyeurl
    tineyeurl = "http://tineye.com/search?url=" + urllib2.quote(imageurl)
    wikipedia.output(u'Get tineye URL: %s' % tineyeurl)
    try:
        html = urllib.urlopen(tineyeurl).read()
        tineyetext = FormatPage(html)
        bot(tineyetext, imagepage)
    except IOError:
        wikipedia.output(u"Skipping [[%s]] because IOError." % imagepage.title())


# Titles of the images that were already handled during this run.
seen = set()
try:
    while True:
        if not seen:
            wikipedia.output(u'>>>Loading the new images<<<\n')
        else:
            print('----- Current time: %s' % datetime.datetime.now())
            print('>>>Waiting load new images<<<\n')
            # NOTE(review): the pause between two passes is disabled in the
            # original script (time.sleep(60)).

        # --- Step 1: clean the report page --------------------------------
        pagetemp = wikipedia.Page(mysite, u'User:Sz-iwbot/tineye')
        texttemp = pagetemp.get()
        params = {
            'action'    :'query',
            'prop'      :'images',
            'titles'   :'User:Sz-iwbot/tineye',
            'imlimit'    :'5000',
        }
        imagedata = query.GetData(params, useAPI = True, encodeTitle = False)
        # Collect the images of whatever page entry the API returns instead
        # of indexing a hard-coded page id, and tolerate a missing 'images'
        # key, which previously aborted the script with a KeyError.
        imagegroup = []
        for pageinfo in imagedata['query']['pages'].values():
            imagegroup.extend(pageinfo.get('images', []))

        for imagetitle in imagegroup:
            image = wikipedia.Page(mysite, imagetitle['title'])
            if not image.exists():
                # re.escape() protects every regex metacharacter in the
                # title, not only '(', ')' and '+'.
                entry_re = re.compile(r'\n\{\{User:Sz-iwbot\/sd\|image= %s \|n= .*? \}\}' % re.escape(imagetitle['title']))
                texttemp = wikipedia.replaceExcept(texttemp, entry_re, '', exceptions='')
                wikipedia.output(u'remove [[%s]], it have not exit.' % imagetitle['title'])
            # Disabled in the original script: re-adding {{Similarimages}}
            # to listed images that lack it, and dropping entries whose
            # image carries {{Similarimages|checked}} or
            # {{Similarimages|check}}.

        texttemp = wikipedia.replaceExcept(texttemp, EMPTY_ENTRY_RE, '', exceptions='')
        wikipedia.output(u'cleaning...')
        if texttemp == pagetemp.get():
            wikipedia.output(u'>>>Page not change, not clean!<<<\n')
        else:
            pagetemp.put(texttemp, 'clean up')
            wikipedia.output(u'>>>CLEAN OK.<<<\n')

        # --- Step 2: look up the newest uploads ---------------------------
        blacklist = wikipedia.Page(mysite, u'User:Sz-iwbot/tineye/blacklist')
        # Read the linked user pages once per pass, not once per image.
        blacklisted = blacklist.linkedPages()
        for (imagepage, timestamp, user, comment) in mysite.newimages(number = 50):
            userp = wikipedia.Page(mysite, u'User:' + user)
            bpass = userp in blacklisted
            if bpass:
                wikipedia.output(u'Uploader %s in blacklist, pass...' % user)

            if imagepage.title() in seen:
                continue
            seen.add(imagepage.title())

            try:
                templates = imagepage.templates()
                has_pass_template = False
                for pt in PASS_TEMPLATES:
                    if pt in templates:
                        has_pass_template = True
                        wikipedia.output(u'{{%s}} in imagepage, pass...' % pt)
                        break

                # A title without one of the accepted extensions makes
                # search() return None; the resulting AttributeError is
                # caught at the bottom and the file is skipped.
                # NOTE(review): getFileVersionHistory() is compared with an
                # int directly; if it returns a list this is always False in
                # Python 2 and len() was probably intended -- confirm before
                # changing, since it alters which images get checked.
                if IMAGE_TITLE_RE.search(imagepage.title()).groups() == () and (not has_pass_template or imagepage.getFileVersionHistory() < 2):
                    # Ask the API for the uploader's user groups.
                    params = {
                        'action'    :'query',
                        'list'      :'users',
                        'ususers'   :user,
                        'usprop'    :'groups',
                    }
                    data = query.GetData(params, useAPI = True, encodeTitle = False)
                    group = data['query']['users'][0]

                    # Only uploaders whose user record has exactly one field
                    # are checked (original comment: "No check sysop or bot
                    # upload"); blacklisted uploaders are skipped silently.
                    if len(group) == 1 and not bpass:
                        try:
                            # Ask the API for the image size and URL.
                            params = {
                                'action'    :'query',
                                'prop'      :'imageinfo',
                                'titles'   :imagepage.title(),
                                'iiprop'    :'size|url',
                            }
                            idata = query.GetData(params, useAPI = True, encodeTitle = False)
                            try:
                                info = list(idata['query']['pages'].values())[0]['imageinfo'][0]
                                # Compare the width as a number.  It used to
                                # be converted to str first, and in Python 2
                                # a str always compares greater than an int,
                                # so the small-image branch never ran.
                                width = int(info['width'])

                                if width > 300:
                                    # Wide image: search with the picture
                                    # shown on the file description page.
                                    imagehtml = imagepage.getImagePageHtml()
                                    try:
                                        thumburl = FULL_IMAGE_RE.search(imagehtml).group('url')
                                        _search_tineye(thumburl, imagepage)
                                    except wikipedia.NoPage:
                                        wikipedia.output(u"Skipping [[%s]] because it has been deleted." % imagepage.title())
                                else:
                                    # Small image: search with the file URL
                                    # reported by the API.
                                    try:
                                        imageurl = info['url']
                                        _search_tineye(imageurl, imagepage)
                                    except wikipedia.NoPage:
                                        wikipedia.output(u"Skipping [[%s]] because it has been deleted." % imagepage.title())
                            except KeyError:
                                wikipedia.output(u'KeyError')
                        except IndexError:
                            # No usable image info: fall back to the file
                            # URL provided by the framework.
                            wikipedia.output(u'ERROR: Not found resolution of image [[%s]]' % imagepage.title())
                            try:
                                imageurl = wikipedia.ImagePage.fileUrl(imagepage)
                                _search_tineye(imageurl, imagepage)
                            except wikipedia.NoPage:
                                wikipedia.output(u"Skipping [[%s]] because it has been deleted." % imagepage.title())
                    elif len(group) != 1:
                        wikipedia.output(u"Skipping [[%s]] because bot or sysop upload ." % imagepage.title())
            except AttributeError:
                wikipedia.output(u"Skipping [[%s]] because can\'t check at tineye." % imagepage.title())
finally:
    # The loop above never ends on its own, so the original trailing
    # stopme() call was unreachable; run it when the script is interrupted
    # or aborted by an error.
    wikipedia.stopme()