User:Thumbnails Check Bot/stage-1.sh

From Wikimedia Commons, the free media repository
Jump to: navigation, search
#!/bin/bash
 
# Pass --silent as the first parameter ($1) to suppress the per-file
# debug output.

workDir="/home/Thumbnails_Check_Bot"

# Holds the start time of the previous run / collects the failed checks.
lastRun="${workDir}/stage-1-last.log"
failLog="${workDir}/stage-1-fail.log"

# User-Agent header sent with every HTTP request.
userAgent="Thumbnails_Check_Bot/0.1 (http://commons.wikimedia.org/wiki/User:Thumbnails_Check_Bot; beta)"

# Number of files per API request (should not exceed 100, to keep the
# GET request short enough).
limit=50

# Do not scan files newer than N seconds (172800 s = 48 h).
skipRecent=172800

# Seconds to wait before the second check when the first one did not find
# a valid thumbnail.
sleepTime=30

# Account used for the wiki login.
# NOTE(review): the password is hard-coded here and later handed to the
# login script as a command-line argument; consider reading it from a
# protected file or the environment instead.
CommonsUser="Thumbnails Check Bot"
CommonsPasswd="xxx"

#--------------------------------------------------------

# Counters, all starting at zero.
countPages=0
countAll=0
countValid=0
countInvalid=0
countMissing=0
headError=0
detailsError=0
listError=0
countUnknown=0

# Start of this run as seconds since the epoch (UTC).
startTime="$(date --utc +%s)"

# Scratch files. mktemp replaces the deprecated debianutils "tempfile",
# which is no longer shipped on current systems; abort early when a file
# cannot be created instead of continuing with an empty path.
mimeLog="$(mktemp)" || exit 1
mimeLogFail="$(mktemp)" || exit 1
tmpfile="$(mktemp)" || exit 1
 
# Print one "<label>:<mime>:<count>" line per distinct MIME type listed in
# the file $2 (one MIME type per line), then delete that file.
# Does nothing when the file does not exist.
#   $1 - label to prefix each line with
#   $2 - path of the MIME log file
_writelog_mime_counts() {
	local label="$1" logfile="$2"
	local count mime

	[ -e "${logfile}" ] || return 0

	# "uniq -c" emits "<padding><count> <mime>". Reading into separate
	# fields keeps an empty MIME entry empty; cutting field 2 out of a
	# line without a delimiter would return the count a second time.
	sort < "${logfile}" | uniq -c | while read -r count mime _ ; do
		echo "${label}:${mime}:${count}"
	done
	rm -f -- "${logfile}"
}

# Print the statistics of this run to stdout as "key:value" lines and
# remove the temporary files of the run.
#
# Globals read:    startTime, countPages, countAll, countValid,
#                  countInvalid, countUnknown, countMissing, headError,
#                  detailsError, listError, mimeLog, mimeLogFail, tmpfile
# Globals written: endTime
# Arguments:       none
# Outputs:         the counters, the per-MIME-type counts of all checked
#                  and of all failed files, endTime and runTime (seconds)
writelog() {
	echo "startTime:${startTime}"
	echo "countPages:${countPages}"
	echo "countAll:${countAll}"
	echo "countValid:${countValid}"
	echo "countInvalid:${countInvalid}"
	echo "countUnknown:${countUnknown}"
	echo "countMissing:${countMissing}"
	echo "headError:${headError}"
	echo "detailsError:${detailsError}"
	# listError is counted by the scan loop as well, so report it too.
	echo "listError:${listError}"

	_writelog_mime_counts "countMimetype" "${mimeLog}"
	_writelog_mime_counts "countMimetypeFail" "${mimeLogFail}"

	# cleanup of every scratch file derived from ${tmpfile}
	rm -f -- \
		"${tmpfile}.head" \
		"${tmpfile}.head.2nd" \
		"${tmpfile}.formated" \
		"${tmpfile}.titles" \
		"${tmpfile}.details" \
		"${tmpfile}.addcat" \
		"${tmpfile}"

	endTime="$(date --utc +%s)"
	echo "endTime:${endTime}"
	echo "runTime:$(( endTime - startTime ))"
}
 
# Per-file status lines ("[+]", "[-]", "[?]") are printed unless the first
# argument is --silent.
debug=true
if [[ "${1:-}" == "--silent" ]] ; then
	debug=false
fi

# Print the status line $1, but only when debug output is enabled.
report() {
	if [[ "${debug}" == "true" ]] ; then
		echo "$1"
	fi
}

# Print the value of the $2-th "[$1] => value" line of ${tmpfile}.formated.
nth_value() {
	grep "^\[$1\] => " "${tmpfile}.formated" | sed -n "${2}p" | cut -d ' ' -f 3-
}

# Request only the response headers of URL $1 and store them in file $2.
# Returns curl's exit status.
fetch_head() {
	curl --user-agent "${userAgent}" --silent --head "$1" 2>/dev/null > "$2"
}

# Print the status code of the first HTTP status line in the header file $1.
http_status_of() {
	grep -m 1 '^HTTP/' "$1" | cut -d ' ' -f 2
}

# The scan walks backwards in time: it starts skipRecent seconds before
# now and stops skipRecent seconds before the start of the previous run.
nextTime=$(( startTime - skipRecent ))

lastStart=""
if [[ -r "${lastRun}" ]] ; then
	lastStart="$(cat "${lastRun}")"
fi
if ! [[ "${lastStart}" =~ ^[0-9]+$ ]] ; then
	# Without a usable previous start time there is no lower bound for the
	# scan. Scan nothing and only record this run's start time, instead of
	# failing with arithmetic errors on an empty value.
	echo "warning: no usable start time in ${lastRun}; nothing is scanned in this run" >&2
	lastStart="${startTime}"
fi
stopTime=$(( lastStart - skipRecent ))

# NOTE(review): this relies on the API's "format=txt" output and on
# plain-http endpoints - confirm that the server still provides both.
# NOTE(review): headError is reported by writelog, but nothing in this
# section increments it; failed HEAD requests are counted in countUnknown.
while (( nextTime > stopTime )) ; do

	countPages=$(( countPages + 1 ))

	# Step 1: list the next page of files, newest first, starting at nextTime.
	if ! curl --user-agent "${userAgent}" --get --silent --data "action=query" --data "list=allimages" --data "format=txt" --data "aidir=descending" --data "aisort=timestamp" --data "aiprop=mime" --data "ailimit=${limit}" --data "aistart=${nextTime}" "http://commons.wikimedia.org/w/api.php" > "${tmpfile}" ; then
		# the list could not be downloaded
		listError=$(( listError + 1 ))
		writelog
		exit 1
	fi

	# All "File:" titles of the page, joined with "|" for the titles= parameter.
	titles="$(sed 's/^ *//g' "${tmpfile}" | grep '^\[title\] => ' | cut -d ' ' -f 3- | grep -a '^File:' | tr '\n' '|' | sed 's/|$//g')"

	# Step 2: fetch thumbnail URL (120 px wide) and MIME type of these files.
	if ! curl --user-agent "${userAgent}" --get --silent --data-urlencode "titles=${titles}" --data "action=query" --data "format=txt" --data "prop=imageinfo" --data "iiprop=url|mime" --data "iiurlwidth=120" "http://commons.wikimedia.org/w/api.php" > "${tmpfile}.details" ; then
		# the image details could not be downloaded
		detailsError=$(( detailsError + 1 ))
		writelog
		exit 1
	fi

	# Keep only the title/thumburl/mime lines. A "[missing]" marker is
	# replaced by an empty thumburl and an empty mime line, so that the
	# n-th title still lines up with the n-th thumburl and the n-th mime.
	sed "s/^ *//g" "${tmpfile}.details" \
		| sed 's/^\[missing\] =>\(.*\)/[thumburl] => \n[mime] => \n/g' \
		| grep '^\[\(title\|thumburl\|mime\)\] => ' > "${tmpfile}.formated"

	# Walk over the files that were really returned. Always iterating up to
	# ${limit} would count non-existing entries as files and as missing
	# whenever a page holds fewer files.
	fileCount="$(grep -c '^\[title\] => ' "${tmpfile}.formated")"

	for (( nr=1; nr<=fileCount; nr++ )) ; do

		countAll=$(( countAll + 1 ))

		title="$(nth_value title "${nr}")"
		thumburl="$(nth_value thumburl "${nr}")"
		mime="$(nth_value mime "${nr}")"

		echo "${mime}" >> "${mimeLog}"

		# missing file: nothing to check
		if [[ -z "${thumburl}" || -z "${mime}" ]] ; then
			countMissing=$(( countMissing + 1 ))
			continue
		fi

		# First check.
		if ! fetch_head "${thumburl}" "${tmpfile}.head" ; then
			# the headers could not be fetched
			report "[?] ${title}"
			countUnknown=$(( countUnknown + 1 ))
			rm -f -- "${tmpfile}.head"
			continue
		fi
		http_code="$(http_status_of "${tmpfile}.head")"
		rm -f -- "${tmpfile}.head"

		if [[ "${http_code}" == "200" ]] ; then
			report "[+] ${title}"
			countValid=$(( countValid + 1 ))
			continue
		fi

		# Second check after a pause.
		sleep "${sleepTime}"
		if ! fetch_head "${thumburl}" "${tmpfile}.head.2nd" ; then
			# the headers could not be fetched
			report "[?] ${title}"
			countUnknown=$(( countUnknown + 1 ))
			rm -f -- "${tmpfile}.head.2nd"
			continue
		fi
		http_code_2nd="$(http_status_of "${tmpfile}.head.2nd")"
		rm -f -- "${tmpfile}.head.2nd"

		if [[ "${http_code_2nd}" == "200" ]] ; then
			report "[+] ${title}"
			countValid=$(( countValid + 1 ))
			continue
		fi

		# Both checks failed: log the file and tag its page with the category.
		report "[-] ${title}"
		echo "${http_code} ${http_code_2nd} ${mime} ${title}" >> "${failLog}"
		echo "${mime}" >> "${mimeLogFail}"
		countInvalid=$(( countInvalid + 1 ))

		# Log in first when the login test reports "not logged in".
		# NOTE(review): the password is passed as a command-line argument.
		if python2.7 pywikipedia/login.py -test | grep -i -m 1 "not logged in" &> /dev/null ; then
			python2.7 pywikipedia/login.py -user:"${CommonsUser}" -pass:"${CommonsPasswd}"
		fi

		# add cat
		echo "${title}" > "${tmpfile}.addcat"
		python2.7 pywikipedia/add_text.py -text:"[[Category:Possibly files without thumbnails detected by bot]]" -except:"\[\[Category:Possibly files without thumbnails detected by bot\]\]" -summary:"Bot: I did not found a valid thumbnail, so I add a category. (HTTP status code was ${http_code_2nd})" -file="${tmpfile}.addcat" -always
		rm -f -- "${tmpfile}.addcat"

	done

	rm -f -- "${tmpfile}.formated" "${tmpfile}.titles" "${tmpfile}.details"

	# The last "[aistart]" entry of the list response is taken as the point
	# where the next page starts.
	nextStart="$(tac "${tmpfile}" | grep -m 1 -i -a "\[aistart\]" | sed 's/^ *//g' | cut -d ' ' -f 3-)"
	rm -f -- "${tmpfile}"

	# An empty value would make "date --date=" fall back to the start of the
	# current day and restart the scan at a wrong point. Treat a missing or
	# unparsable continuation as a failed list request instead.
	if [[ -z "${nextStart}" ]] || ! nextTime="$(date --utc --date="${nextStart}" +%s)" ; then
		listError=$(( listError + 1 ))
		writelog
		exit 1
	fi

done

writelog

# Remember when this run started; the next run scans back to this point.
echo "${startTime}" > "${lastRun}"
exit 0