User:Inkwina/catlistcount3.py

From Wikimedia Commons, the free media repository
Jump to: navigation, search

#!/usr/bin/python
# -*- coding: UTF-8  -*-

import shelve
import urllib
import simplejson
import time
import re
import mwclient


# --- Report settings --------------------------------------------------------
# Number of entries listed in the published gallery.
Howmany = 200
# Icon thresholds: a file is kept only when it has more than `iconpix` pixels
# AND is larger than `iconsize` bytes.
iconpix = 128 * 128
iconsize = 10 * 1024
Whichcategory = 'Category:Vector version available'
Wheretosave = u'User:Inkwina/Top 200 Non-Icons which have a Vector version available by usage'
# On-disk cache (shelve) used to resume and to hold the tallies.
shelffile = "./catlistcount3.cache"

# --- Commons API query ------------------------------------------------------
# Browsable form of the same query:
# http://commons.wikimedia.org/w/api.php?format=jsonfm&action=query&generator=categorymembers&gcmtitle=Category:Vector%20version%20available&prop=imageinfo&iiprop=size&gcmnamespace=6
apiurl = "http://commons.wikimedia.org/w/api.php"
apiparams = dict(
    format="json",
    action="query",
    prop='imageinfo',
    iiprop='size',
    generator='categorymembers',
    gcmnamespace='6',
    gcmlimit="50",
    gcmtitle=Whichcategory,
)

# --- CheckUsage query -------------------------------------------------------
checkusageurl = "http://tools.wikimedia.de/~daniel/WikiSense/CheckUsage.php"
checkusageparams = dict(
    i='',        # filename(s); filled in per batch
    w='_wp_20',  # which wikis to check (top 20 wikipedias not to kill server)
    x='main',    # what kind of pages
    r='on',      # RAW
    b='1',       # not Bulk, we check 1 by 1
)

# Matches a "[wiki] count" header line of the raw CheckUsage output:
# group(1) is the text inside the brackets, group(2) the digits after it.
wikire = re.compile(r'\s*\[([^\]]*)\]\s*(\d*)')

##########

datastore=shelve.open(shelffile, writeback=True)
if not ("items" in datastore): #newfile
    datastore["items"]={}
###    datastore["wikis"]={}
    datastore["all-done"] = False
else:
    apiparams["gcmcontinue"]=datastore["query-continue"] #pick up where we left last time
    print "Continuing from: ",apiparams["gcmcontinue"]

while not datastore["all-done"]:
    checkusageparams['i']=''
    query=urllib.urlopen(apiurl,urllib.urlencode(apiparams))
    data=simplejson.load(query)
    for item in data["query"]["pages"].itervalues():
           # print item['imageinfo'][0]['size']
           try:
                   itemsize=item['imageinfo'][0]['width']*item['imageinfo'][0]['height'] #imageinfo returns a list with one dictionary. bleh!!
                   if ((itemsize>iconpix) & (int(item['imageinfo'][0]['size'])>iconsize) ):  #pick out non-icons   
                        Fname=item["title"].split(':')[-1]
                        Fname=Fname.encode('UTF-8').replace(' ','_')
                        checkusageparams['i'] += Fname+"\n"
                        datastore["items"][Fname]={"countof": {}, "counttotal": 0, "checked": False}
                        print "Added: "+Fname
                   else:
                        print "Skipped: "+item["title"].split(':')[-1]
           except KeyError:
                        print "Gagged on: ",item
           datastore.sync()
    print "--- Cached Data ---"
    print "From: "+data["query"]["pages"].values()[1]["title"]
    print "To: "+data["query"]["pages"].values()[-1]["title"]
    
    if  "query-continue" in data:
        apiparams["gcmcontinue"]=data["query-continue"]["categorymembers"]["gcmcontinue"].encode("UTF-8")
        datastore["query-continue"] =apiparams["gcmcontinue"]
        datastore.sync()
    else:
        datastore["all-done"] =True

    query=urllib.urlopen(checkusageurl,urllib.urlencode(checkusageparams))
    for line in query.readlines():
        sulfarini=wikire.match(line)
        if sulfarini != None:
            print sulfarini.group(1)+" : "+sulfarini.group(2)
            whichwiki=sulfarini.group(1)
        else:
            try:
                page,image=line.split()
            except:
                continue
            if whichwiki in datastore["items"][image]["countof"]:
                datastore["items"][image]["countof"][whichwiki]+=1
###                datastore["wikis"][whichwiki]+=1
            else:
                datastore["items"][image]["countof"][whichwiki]=1
###                datastore["wikis"][whichwiki]=1
            datastore["items"][image]["counttotal"]+=1    
            print image+","+whichwiki+","+ str(datastore["items"][image]["countof"][whichwiki])+","+str(datastore["items"][image]["counttotal"])
    datastore.sync()
    time.sleep(2)
    
# Roll the per-file tallies up: `ftotal` is the grand total over all files and
# datastore["wikis"] maps each wiki to the number of uses counted on it.
datastore["wikis"] = {}
wikitotals = datastore["wikis"]  # the writeback shelf hands back the cached dict
ftotal = 0
for entry in datastore["items"].itervalues():
    ftotal += int(entry["counttotal"])
    for wiki, uses in entry['countof'].iteritems():
        wikitotals[wiki] = wikitotals.get(wiki, 0) + uses

# Wiki text of the report page: fixed introduction first, then a timestamp,
# the overall statistics and a numbered list of wikis ordered by use count.
output = """
This Page is an Automatically generated list of the 200 most used Non-Icon Images with a
[[:Category:Vector version available|Vector version available]].

For the purpuses of the page an Icon is any image that is less than 128x128px (specifically has less than 16384 pixels)
OR is smaller than 10K (10240 bytes) is size.

The code for making this list is available [[User:Inkwina/catlistcount3.py|here]]
The images are only checked for use in Articles (not talk pages etc.) on the 20 largest wikipedias

--{{User|Inkwina}}
----
[[Category:Vector version available|Top 200 Non-Icons which have a Vector version available by usage]]
"""

timestamp = time.strftime("%a, %d %b %Y %H:%M:%S %Z")
output += "\n'''Last Update " + timestamp + "'''\n"

# Total computed from the per-wiki tallies; reported next to the per-file
# total (ftotal) so the two can be compared.
wtotal = sum(int(uses) for uses in datastore["wikis"].itervalues())
output += "\n*Items in Total: " + str(len(datastore["items"]))
output += "\n**Total use(from wikis) : " + str(wtotal)
output += "\n**Total use(from files) : " + str(ftotal)
output += "\n----\n"

# (uses, wiki) pairs, largest use count first.
wikisort = sorted(((uses, wiki) for wiki, uses in datastore["wikis"].items()), reverse=True)
output += "".join(["\n# " + str(wiki) + ": " + str(uses) for uses, wiki in wikisort])

def mycmp(x,y):
    """Compare two item names so that sorting puts the most used first.

    x and y are keys of datastore["items"]. The result is the negated cmp()
    of their "counttotal" values: negative when x has the larger total,
    positive when y has, zero when they are equal.
    """
    items = datastore["items"]
    return -cmp(items[x]["counttotal"], items[y]["counttotal"])
    
# Gallery of the most used files, highest total first.
sortall = list(datastore["items"])
sortall.sort(mycmp)
output += "\n<gallery>\n"
# Slice instead of indexing through range(Howmany): the cache may hold fewer
# than Howmany files, in which case indexing raised IndexError.
for rank, name in enumerate(sortall[:Howmany]):
    entry = datastore["items"][name]
    caption = "Image:" + name
    caption += "|" + str(rank + 1) + ". Used " + str(entry["counttotal"]) + " times [[:Image:" + name + "]] "
    # Join the per-wiki counts instead of appending ", " after each one and
    # chopping the last two characters: for a file with no recorded use that
    # chop removed "] " from the link and left broken wiki markup.
    perwiki = ", ".join([" " + str(w) + ": " + str(c) for w, c in entry["countof"].items()])
    output += (caption + perwiki).rstrip() + ".\n"

# Disabled alternative listing, kept as found:
#count=0
#for k,v in datastore["items"].items():
#    if  ( v["ns"]==6 and v["counttotal"]<=2 ) :
#        if ( ("commons.wikimedia.org" in v["countof"]  and  v["countof"]["commons.wikimedia.org"]<=2)  or v["counttotal"]==0 ) :
#            output+= "Image:"+k
#            output+= "| Used "+str(v["counttotal"])+" times [[:Image:"+k+"]]\n"
#            count+=1
output += "</gallery>\n"
    
print output
#print count

site = mwclient.Site('commons.wikimedia.org')
site.login("", "")
page = site.Pages[Wheretosave]
page.save(output, summary = u'Inkwina Bot Update')