User:Thumbnails Check Bot/stage-1.sh

From Wikimedia Commons, the free media repository
Jump to: navigation, search
#!/bin/bash

# Pass --silent as the first parameter ($1) to suppress the debug output.

# Directory holding the bot's state and log files
workDir="/home/Thumbnails_Check_Bot"

# State file (start time of the previous run) and log of failed thumbnail checks
lastRun="${workDir}/stage-1-last.log"
failLog="${workDir}/stage-1-fail.log"

# User-Agent header sent with every HTTP request
userAgent="Thumbnails_Check_Bot/0.1 (http://commons.wikimedia.org/wiki/User:Thumbnails_Check_Bot; beta)"

# Number of files per API request (should not exceed 100, to keep the GET request short)
limit=50

# Do not scan files newer than N seconds (172800 s = 48 h)
skipRecent=172800

# Seconds to wait before the second check when the first one found no valid thumbnail
sleepTime=30

# Commons account used by the bot.  The password can be supplied through the
# environment (CommonsPasswd=... ./stage-1.sh) so the real secret does not have
# to be stored in this file; "xxx" remains the placeholder default.
CommonsUser="Thumbnails Check Bot"
CommonsPasswd="${CommonsPasswd:-xxx}"

#--------------------------------------------------------

# Prepare counters (all of them are printed by writelog, except where noted)
countPages=0     # API list pages requested
countAll=0       # file slots processed (always ${limit} per page)
countValid=0     # thumbnails that answered with HTTP 200
countInvalid=0   # thumbnails that answered non-200 on both attempts
countMissing=0   # slots without a thumbnail URL or MIME type
headError=0      # NOTE(review): never incremented in the visible script
detailsError=0   # failed imageinfo requests
listError=0      # failed allimages requests
countUnknown=0   # HEAD requests that curl could not complete
startTime="$(date --utc +%s)"

# Scratch files.  mktemp is used instead of the deprecated, Debian-only
# "tempfile" helper, which is no longer shipped on current systems.
mimeLog="$(mktemp)" || { echo "cannot create temporary file" >&2 ; exit 1 ; }
mimeLogFail="$(mktemp)" || { echo "cannot create temporary file" >&2 ; exit 1 ; }
tmpfile="$(mktemp)" || { echo "cannot create temporary file" >&2 ; exit 1 ; }

# Print "<label>:<mime>:<count>" for every distinct line of a MIME log
# (one MIME type per line, empty lines allowed), then delete the log.
# Does nothing when the file does not exist.
#   $1 - path of the MIME log
#   $2 - label prefixed to every output line
function _writelog_mime_counts {
	local file="$1" label="$2"
	local count mime

	test -e "${file}" || return 0

	# "uniq -c" emits "<count> <line>".  Letting read split the two fields
	# keeps an empty MIME type empty; cutting field 2 out of a line that has
	# no delimiter left would repeat the count in the MIME column instead.
	sort "${file}" | uniq -c | while read -r count mime _ ; do
		echo "${label}:${mime}:${count}"
	done
	rm -f -- "${file}"
}

# Write the run statistics as "key:value" lines to stdout and remove all
# scratch files.
# Globals read:    startTime, the count*/…Error counters, mimeLog,
#                  mimeLogFail, tmpfile
# Globals written: endTime
# Outputs:         counters, per-MIME-type totals (countMimetype /
#                  countMimetypeFail), endTime and runTime
function writelog {
	echo "startTime:${startTime}"
	echo "countPages:${countPages}"
	echo "countAll:${countAll}"
	echo "countValid:${countValid}"
	echo "countInvalid:${countInvalid}"
	echo "countUnknown:${countUnknown}"
	echo "countMissing:${countMissing}"
	echo "headError:${headError}"
	echo "detailsError:${detailsError}"
	# listError is counted by the scan loop, so report it like the others
	echo "listError:${listError}"

	_writelog_mime_counts "${mimeLog}" "countMimetype"
	_writelog_mime_counts "${mimeLogFail}" "countMimetypeFail"

	# cleanup of the scratch files derived from ${tmpfile}
	if [ -n "${tmpfile}" ] ; then
		rm -f -- "${tmpfile}.head" "${tmpfile}.head.2nd" "${tmpfile}.formated" \
			"${tmpfile}.titles" "${tmpfile}.details" "${tmpfile}.addcat" \
			"${tmpfile}"
	fi

	endTime="$(date --utc +%s)"
	echo "endTime:${endTime}"
	echo "runTime:$(( endTime - startTime ))"

}

# Debug output is enabled unless "--silent" is given as the first argument.
# ${debug} holds the name of a command (true/false), so the rest of the
# script can write "${debug} && echo ...".
case "$1" in
	--silent) debug=false ;;
	*)        debug=true ;;
esac

# ----------------------------------------------------------------------------
# Main scan
#
# Walks the allimages list newest-to-oldest in pages of ${limit} files,
# starting at (startTime - skipRecent) and stopping once the window covered by
# the previous run (lastStart - skipRecent) is reached.  For every file the
# 120px thumbnail URL is probed with an HTTP HEAD request.  A thumbnail that
# does not answer "200" twice in a row (${sleepTime}s apart) is written to
# ${failLog} and its file page is categorised through pywikipedia.
# ----------------------------------------------------------------------------

nextTime=$(( startTime - skipRecent ))

# Start time of the previous run.  If the state file is missing or does not
# contain a plain integer (e.g. on the very first run), fall back to startTime:
# the loop below is then skipped and the state file is (re)written at the end.
lastStart=""
if [ -r "${lastRun}" ] ; then
	lastStart="$(cat "${lastRun}")"
	lastStart="${lastStart//[[:space:]]/}"
fi
if ! [[ "${lastStart}" =~ ^(0|[1-9][0-9]*)$ ]] ; then
	lastStart="${startTime}"
fi

# Oldest timestamp that still has to be scanned (loop-invariant)
stopTime=$(( lastStart - skipRecent ))

while [ "${nextTime}" -gt "${stopTime}" ] ; do

	countPages=$(( countPages + 1 ))
	if curl --user-agent "${userAgent}" --get --silent --data "action=query" --data "list=allimages" --data "format=txt" --data "aidir=descending" --data "aisort=timestamp" --data "aiprop=mime" --data "ailimit=${limit}" --data "aistart=${nextTime}" "http://commons.wikimedia.org/w/api.php" > "${tmpfile}" ; then

		# Collect the "File:" titles of this page, joined with "|" for the titles= parameter.
		# grep -a: titles may contain non-ASCII bytes that must not be treated as binary.
		titles="$(sed 's/^ *//g' "${tmpfile}" | grep -a '^\[title\] => ' | cut -d ' ' -f 3- | grep -a '^File:' | tr '\n' '|' | sed 's/|$//g')"
		if curl --user-agent "${userAgent}" --get --silent --data-urlencode "titles=${titles}" --data "action=query" --data "format=txt" --data "prop=imageinfo" --data "iiprop=url|mime" --data "iiurlwidth=120" "http://commons.wikimedia.org/w/api.php" > "${tmpfile}.details" ; then

			# Keep only the title/thumburl/mime lines.  A "[missing]" entry is replaced by
			# empty thumburl and mime lines so that the N-th title, thumburl and mime
			# still belong to the same file.
			sed "s/^ *//g" "${tmpfile}.details" | sed 's/^\[missing\] =>\(.*\)/[thumburl] => \n[mime] => \n/g' | grep -a '^\[\(title\|thumburl\|mime\)\] => ' > "${tmpfile}.formated"

			# Always iterates ${limit} slots; slots beyond the number of returned files
			# yield empty fields and are counted as missing.
			for (( nr=1; nr<=limit; nr++ )) ; do

				countAll=$(( countAll + 1 ))

				title="$(grep -a '^\[title\] => ' "${tmpfile}.formated" | sed -n "${nr}p" | cut -d ' ' -f 3-)"
				thumburl="$(grep -a '^\[thumburl\] => ' "${tmpfile}.formated" | sed -n "${nr}p" | cut -d ' ' -f 3-)"
				mime="$(grep -a '^\[mime\] => ' "${tmpfile}.formated" | sed -n "${nr}p" | cut -d ' ' -f 3-)"

				echo "${mime}" >> "${mimeLog}"

				if [ "${thumburl}" != "" ] && [ "${mime}" != "" ] ; then

					if curl --user-agent "${userAgent}" --silent --head "${thumburl}" 2>/dev/null > "${tmpfile}.head" ; then

						http_code="$(grep -m 1 '^HTTP/' "${tmpfile}.head" | cut -d ' ' -f 2)"
						if [ "${http_code}" != "200" ] ; then

							# sleep and check again
							sleep "${sleepTime}"
							if curl --user-agent "${userAgent}" --silent --head "${thumburl}" 2>/dev/null > "${tmpfile}.head.2nd" ; then

								http_code_2nd="$(grep -m 1 '^HTTP/' "${tmpfile}.head.2nd" | cut -d ' ' -f 2)"
								if [ "${http_code_2nd}" != "200" ] ; then

									${debug} && echo "[-] ${title}"
									echo "${http_code} ${http_code_2nd} ${mime} ${title}" >> "${failLog}"
									echo "${mime}" >> "${mimeLogFail}"
									countInvalid=$(( countInvalid + 1 ))

									# Log in again if the pywikipedia session is gone.
									# NOTE(review): the password is passed on the command line and is
									# therefore visible in the process list; the pywikipedia scripts are
									# addressed relative to the current directory, not ${workDir}.
									if python2.7 pywikipedia/login.py -test | grep -i -m 1 "not logged in" &> /dev/null ; then
										python2.7 pywikipedia/login.py -user:"${CommonsUser}" -pass:"${CommonsPasswd}"
									fi

									# add cat
									echo "${title}" > "${tmpfile}.addcat"
									python2.7 pywikipedia/add_text.py -text:"[[Category:Possibly files without thumbnails detected by bot]]" -except:"\[\[Category:Possibly files without thumbnails detected by bot\]\]" -summary:"Bot: I did not found a valid thumbnail, so I add a category. (HTTP status code was ${http_code_2nd})" -file="${tmpfile}.addcat" -always
									rm -f -- "${tmpfile}.addcat"

								else
									${debug} && echo "[+] ${title}"
									countValid=$(( countValid + 1 ))
								fi

							else
								${debug} && echo "[?] ${title}"
								countUnknown=$(( countUnknown + 1 ))
								# header could not be retrieved
							fi
							rm -f -- "${tmpfile}.head.2nd"
						else
							${debug} && echo "[+] ${title}"
							countValid=$(( countValid + 1 ))
						fi
					else
						${debug} && echo "[?] ${title}"
						countUnknown=$(( countUnknown + 1 ))
						# header could not be retrieved
					fi
					rm -f -- "${tmpfile}.head"

				else
					countMissing=$(( countMissing + 1 ))
					# missing file
				fi

			done
			rm -f -- "${tmpfile}.formated"

		else
			detailsError=$(( detailsError + 1 ))
			# image details could not be downloaded
			writelog
			exit 1
		fi
		rm -f -- "${tmpfile}.details"
	else
		listError=$(( listError + 1 ))
		# list could not be downloaded
		writelog
		exit 1
	fi

	# Continuation timestamp for the next page.  Without one (e.g. the API
	# answered with an error body) "date --date=" would resolve the empty string
	# to today's midnight and the scan would start over again, so treat a
	# missing or unparsable timestamp as a failed list request.
	nextStart="$(tac "${tmpfile}" | grep -m 1 -i -a '\[aistart\]' | sed 's/^ *//g' | cut -d ' ' -f 3-)"
	if [ -z "${nextStart}" ] || ! nextTime="$(date --utc --date="${nextStart}" +%s)" ; then
		listError=$(( listError + 1 ))
		writelog
		exit 1
	fi
	rm -f -- "${tmpfile}"

done

writelog

# Remember when this run started; the next run scans back to this point.
echo "${startTime}" > "${lastRun}"
exit 0