User:Inkwina/catlistcount2.py

From Wikimedia Commons, the free media repository
Jump to: navigation, search
#!/usr/bin/python
# -*- coding: UTF-8  -*-

import shelve
import urllib
import simplejson
import time
import re
import mwclient


# ---------------------------------------------------------------------------
# Report configuration
# ---------------------------------------------------------------------------
# Number of gallery entries written to the report.
Howmany = 200
# Category whose members are scanned.
Whichcategory = 'Category:Vector version available'
# Title of the wiki page the finished report is saved to.
Wheretosave = u'Top 200 Images which have a Vector version available by usage'

# Path of the shelve cache holding the harvested items and usage counts.
shelffile = "./catlistcount2.cache"

# MediaWiki API endpoint and the categorymembers query sent to it.
# "cmcontinue" is added to this dict at runtime while paging through results.
apiurl = "http://commons.wikimedia.org/w/api.php"
apiparams = {
    'format': "json",
    'action': "query",
    'list': "categorymembers",
    'cmlimit': "50",
    'cmprop': 'title',
    'cmtitle': Whichcategory,
}

# CheckUsage tool endpoint and its request parameters.
checkusageurl = "http://tools.wikimedia.de/~daniel/WikiSense/CheckUsage.php"
checkusageparams = {
    'i': '',        # filename(s); filled in per batch, one name per line
    'w': '_wp_20',  # which wikis to check (top 20 wikipedias not to kill server)
    'x': 'main',    # what kind of pages
    'r': 'on',      # RAW
    'b': '1',       # not Bulk, we check 1 by 1
}

# Matches a "[name] digits" line of the CheckUsage response: group 1 is the
# bracketed text (used by the main loop as the wiki identifier), group 2 the
# possibly empty run of digits that follows.  Written as a raw string so the
# backslashes reach the regex engine without relying on unrecognised string
# escapes; the compiled pattern is unchanged.
wikire = re.compile(r'\s*\[([^\]]*)\]\s*(\d*)')

# Open (or create) the on-disk cache.  writeback=True makes the shelf keep
# fetched entries in memory so the nested dicts can be mutated in place;
# they are written back by datastore.sync() / datastore.close().
datastore = shelve.open(shelffile, writeback=True)
if "items" not in datastore:  # new cache file
    datastore["items"] = {}
    datastore["all-done"] = False
else:
    # Existing cache: make sure the completion flag is present even if the
    # previous run died right after creating "items".
    if "all-done" not in datastore:
        datastore["all-done"] = False
    # Pick up where we left off last time.  The continuation token is only
    # stored once the API has reported more results, so it can legitimately
    # be missing (e.g. the whole category fit into a single batch); indexing
    # it unconditionally would raise KeyError on every later run.
    if "query-continue" in datastore:
        apiparams["cmcontinue"] = datastore["query-continue"]

# Main harvesting loop.  Each pass:
#   1. fetches one batch of category members from the MediaWiki API,
#   2. records every member in datastore["items"] (images get fresh counters),
#   3. stores the continuation token, or marks the run as complete,
#   4. asks CheckUsage about the batch's images and tallies the usage lines.
# NOTE(review): the continuation token is persisted before the CheckUsage
# request, so a run interrupted during step 4 appears to resume with the
# *next* batch and leave the interrupted one uncounted -- confirm whether
# that is acceptable.
while not datastore["all-done"]:
    checkusageparams['i'] = ''
    query = urllib.urlopen(apiurl, urllib.urlencode(apiparams))
    try:
        data = simplejson.load(query)
    finally:
        query.close()
    members = data["query"]["categorymembers"]
    for item in members:
        # Drop only the leading "Namespace:" prefix, so a title that itself
        # contains further colons is not truncated to its last fragment.
        Fname = item["title"].split(':', 1)[-1]
        Fname = Fname.encode('UTF-8').replace(' ', '_')
        datastore["items"][Fname] = {"ns": item["ns"]}
        if item["ns"] == 6:  # pick out Image:
            checkusageparams['i'] += Fname + "\n"
            datastore["items"][Fname]["countof"] = {}
            datastore["items"][Fname]["counttotal"] = 0
            datastore["items"][Fname]["checked"] = False
            print("Added: " + Fname)
    datastore.sync()
    print("--- Cached Data ---")
    if members:  # an empty batch has no first/last title to report
        # Encoded like Fname above so non-ASCII titles do not fail when
        # stdout has no usable encoding.
        print("From: " + members[0]["title"].encode('UTF-8'))
        print("To: " + members[-1]["title"].encode('UTF-8'))

    if "query-continue" in data:
        apiparams["cmcontinue"] = data["query-continue"]["categorymembers"]["cmcontinue"].encode("UTF-8")
        datastore["query-continue"] = apiparams["cmcontinue"]
        datastore.sync()
    else:
        datastore["all-done"] = True

    query = urllib.urlopen(checkusageurl, urllib.urlencode(checkusageparams))
    try:
        usagelines = query.readlines()
    finally:
        query.close()

    # Reset per response: a usage line must never be attributed to a wiki
    # header left over from the previous batch.
    whichwiki = None
    for line in usagelines:
        header = wikire.match(line)
        if header is not None:
            print(header.group(1) + " : " + header.group(2))
            whichwiki = header.group(1)
            continue
        try:
            usedon, usedfile = line.split()
        except ValueError:
            # Not exactly two whitespace-separated fields: not a usage line.
            continue
        entry = datastore["items"].get(usedfile)
        if whichwiki is None or entry is None or "countof" not in entry:
            # No wiki header seen yet, or the file is not one of the images
            # recorded above; report it instead of crashing the whole run.
            print("Skipped: " + line.strip())
            continue
        entry["countof"][whichwiki] = entry["countof"].get(whichwiki, 0) + 1
        entry["counttotal"] += 1
        print(usedfile + "," + whichwiki + "," + str(entry["countof"][whichwiki]) + "," + str(entry["counttotal"]))
    datastore.sync()
    time.sleep(2)  # pause between batches
    
# Recompute the aggregate figures from the per-image records:
#   datastore["wikis"] -- total uses per wiki, rebuilt from scratch
#   ftotal             -- sum of every image's "counttotal"
# Only namespace-6 (image) entries carry counters; everything else is skipped.
datastore["wikis"] = {}
ftotal = 0
for entry in datastore["items"].itervalues():
    if entry['ns'] != 6:
        continue
    ftotal += int(entry["counttotal"])
    for wiki, uses in entry['countof'].iteritems():
        datastore["wikis"][wiki] = datastore["wikis"].get(wiki, 0) + uses

# Static wikitext preamble of the report page.
output = """
This Page is an Automatically generated list of the 200 most used Images with a
[[:Category:Vector version available|Vector version available]]

The code for making this list is available [[User:Inkwina/catlistcount2.py|here]]
The images are only checkd for use in Articles (not talk pages etc.) on the 20 largest wikipedias

--{{User|Inkwina}}
----
[[Category:Vector version available|*  Top 200 by Usage]]
"""

# Timestamp of this run.
output += "\n'''Last Update " + time.strftime("%a, %d %b %Y %H:%M:%S %Z") + "'''\n"

# Summary figures: number of cached items plus the two independently
# computed usage totals (summed per wiki and summed per file).
wtotal = sum(int(uses) for uses in datastore["wikis"].itervalues())
output += "".join([
    "\n*Items in Total: " + str(len(datastore["items"])),
    "\n**Total use(from wikis) : " + str(wtotal),
    "\n**Total use(from files) : " + str(ftotal),
    "\n----\n",
])

# One numbered line per wiki, highest total first.  Pairs are (count, wiki)
# so that sorting orders by count.
wikisort = sorted(
    [(uses, wiki) for wiki, uses in datastore["wikis"].items()],
    reverse=True)
output += "".join(
    ["\n# " + str(wiki) + ": " + str(uses) for uses, wiki in wikisort])

def mycmp(x, y):
    """Compare two item names so that sorting is descending by usage.

    x and y are keys of datastore["items"].  Returns the negated result of
    cmp() on their "counttotal" values, i.e. a negative number when x has
    the larger total, so the most used item sorts first.
    """
    total_x = datastore["items"][x]["counttotal"]
    total_y = datastore["items"][y]["counttotal"]
    return -cmp(total_x, total_y)
    
# Build the <gallery> section: the Howmany most used images, one line each,
# in the form
#   Image:NAME|RANK. Used TOTAL times [[:Image:NAME]]  WIKI: COUNT,  WIKI: COUNT.
sortall = [x for x in datastore["items"] if datastore["items"][x]["ns"] == 6]
sortall.sort(mycmp)  # descending by "counttotal"
output += "\n<gallery>\n"
# Slice rather than index with range(Howmany): the category may contain
# fewer than Howmany images, which used to raise IndexError.
for rank, name in enumerate(sortall[:Howmany]):
    entry = datastore["items"][name]
    line = ("Image:" + name
            + "|" + str(rank + 1) + ". Used " + str(entry["counttotal"])
            + " times [[:Image:" + name + "]]")
    # Join the per-wiki counts instead of appending ", " after each one and
    # chopping the last two characters: with no counts at all that chop
    # removed the closing "]" of the image link.
    usage = ", ".join(
        [" " + str(w) + ": " + str(c) for w, c in entry["countof"].items()])
    if usage:
        line += " " + usage
    output += line + ".\n"
output += "</gallery>\n"
    
#print output

# The cache is not needed past this point: close it explicitly so the
# write-back entries (including the rebuilt per-wiki totals) reach the disk
# even if the upload below fails.
datastore.close()

# Publish the report to the wiki page named by Wheretosave.
# NOTE: the login arguments are placeholders; supply real bot credentials
# before running and keep them out of any published copy of this script.
site = mwclient.Site('commons.wikimedia.org')
site.login("usernae", "passwrd")
page = site.Pages[Wheretosave]
page.save(output, summary=u'Inkwina Bot Update')