User:Smallbot/source/Oregon Historical County Records Guide

From Wikimedia Commons, the free media repository
Jump to navigation Jump to search

The following is the source code used for Commons:Bots/Requests/Smallbot 2.

You need XP+, IE6+ (for mshtml), and .net 2+ (for fiddler).

The download is done in vbscript w/ xhr for the http requests, mshtml for the html parsing, ADOB.stream for the, Msxml2.DOMDocument for the xml. Images are encoded as base64 (which inflates their size ~33%) so they can be stored as plain text in the xml file.

XHR-> Microsoft.XMLHTTP (MSXML2.XMLHTTP.3.0)

Source

[edit]

The source code is released under the MIT license


This file is licensed under the Expat License, sometimes known as the MIT License:

Copyright © 2012 Smallman12q

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

The Software is provided "as is", without warranty of any kind, express or implied, including but not limited to the warranties of merchantability, fitness for a particular purpose and noninfringement. In no event shall the authors or copyright holders be liable for any claim, damages or other liability, whether in an action of contract, tort or otherwise, arising from, out of or in connection with the Software or the use or other dealings in the Software.


Download

[edit]
Const oregonxml = "C:\Oregon.xml" 'XML file where data is stored
Dim html: Set html = CreateObject("htmlfile") 'MSHTML.HTMLDocument
Dim http: Set http = CreateObject("Microsoft.XMLHTTP") 'MSXML2.XMLHTTP.3.0
Dim xmldoc: Set xmldoc = CreateObject("Msxml2.DOMDocument")

'Write to file after every image
Const ForAppending = 8
Dim fs: Set fs = CreateObject("Scripting.FileSystemObject")
Dim file: Set file = fs.OpenTextFile(oregonxml, ForAppending, True,0) '0=ANSI, -1=Unicode

Dim counties: counties = Array("Baker", "Benton", "Clackamas", "Clatsop", "Columbia", "Coos", "Crook", "Curry", "Deschutes", "Douglas", "Gilliam", "Grant", _
 	"Harney", "Hood River", "Jackson", "Jefferson", "Josephine", "Klamath", "Lake", "Lane", "Lincoln", "Linn", "Malheur", "Marion", _
 	 "Morrow", "Multnomah", "Polk", "Sherman", "Tillamook", "Umatilla", "Union", "Wallowa", "Wasco", "Washington", "Wheeler", "Yamhill")

Dim countycounter, countyname	 
'For Each county In counties
For countycounter = 0 To 1 'Change here to number of counties
	Set html = Nothing
	Set html = CreateObject("htmlfile") 'New instance
	countyname = counties(countycounter)
	countynamel = Replace(LCase(countyname)," ","")
	
	'Dim county: Set county = xmldoc.createElement("county")
	'xmldoc.appendChild(county)
	file.WriteLine("<county id=""" & countyname & """>")
	
	'''''''''
	'Get number of images from index 
	http.open "GET", "http://arcweb.sos.state.or.us/pages/records/local/county/scenic/menus/" & countynamel & "/index.html", False
	http.send
	html.write(http.responseText)

	'Get number of images from h2 header: Benton County Scenic Images [88 images]
	Dim h2 : h2= html.getElementsByTagName("h2")(0).innerHTML
	Dim x1 : x1= InStr(h2,"[") + 1
	Dim numberofimages : numberofimages = Mid(h2,x1, InStr(x1,h2," ") - x1)
	
	'html.clear 'Not implemented http://msdn.microsoft.com/en-us/library/aa752567%28v=vs.85%29.aspx
	html.open "about:blank" 'clears
	
	'''''''
	'Scrape and parse data for each image
	Dim imagecounter 'Index starts at 1
	For imagecounter = 1 To numberofimages
		WScript.Echo imagecounter & "/" & numberofimages & " : " & countyname
		Dim image: Set image = xmldoc.createElement("image")
		xmldoc.appendChild(image)
	
		'Load image site
		http.open "GET", "http://arcweb.sos.state.or.us/pages/records/local/county/scenic/" & countynamel & "/" & imagecounter & ".html", False
		http.send
		html.write (http.responseText)
		
		''''''
		'Parse data
		Dim x2 'Integer
		Dim title: title = html.getElementsByTagName("p")(0).innerHTML
		Dim desc: desc = html.getElementsByTagName("p")(3).innerHTML
		x2 = InStr(desc,"(Photo No. ") + 11
		Dim photonumber : photonumber = Mid(desc,x2, Len(desc) - x2)
		desc = RTrim(Mid(desc,1,x2 - 12))
		Dim source: source = html.getElementsByTagName("img")(2).src
		x1=InStrRev(source,"/") + 1
		source = Mid(source,x1)
		source = "http://arcweb.sos.state.or.us/images/records/county/img/scenic/" &countynamel & "/" & source
		
		'Parse out image details
		Dim imagedetails: imagedetails = html.getElementsByTagName("p")(4).innerHTML
		imagedetails = Mid(imagedetails,InStr(imagedetails,"<BR>") + 4)'Remove About this image
		imagedetails = Split(imagedetails, ";", -1, 1) 
		
		'Lazy way of parsing out image details
		Dim detail
		Dim dimensions, datecreated, camera, lens, photographer, keywords
		For Each detail In imagedetails
			lz dimensions,"Dimensions: " 
			lz datecreated,"Date created: " 
			lz camera,"Camera: " 
			lz lens,"Lens: " 
			lz photographer,"Photographer citation: " 
			lz keywords,"Keywords/Tags: " 
		Next
		
		'html.clear
		html.open "about:blank"
		
		''''''''
		'Get image
		http.open "GET", source, False
		http.send
		
		Dim b64 : Set b64 = xmldoc.createElement("imagedata")
		With b64
 			.dataType = "bin.base64"
			.nodeTypedValue = http.responseBody
		End With
 		
		'''''''''
		'Write out nodes
		caddelement "title", title
		caddelement "desc", desc
		caddelement "number", imagecounter
		caddelement "photonumber", photonumber
		caddelement "source", source
		caddelement "dimensions", dimensions
		caddelement "date",datecreated
		caddelement "camera", camera
		caddelement "lens", lens
		caddelement "photographer", photographer
		caddelement "keywords", keywords		
		image.appendChild(b64)
		
		''''''''''
		'Append to file
		file.WriteLine PrettyPrintXml (xmldoc)
		xmldoc.removeChild(image)
		WScript.Sleep 4000
	Next
		file.WriteLine("</county>")
Next


'Lazy split
Sub lz(ByRef tosplit, splitby) 
	Dim s: s=Split(detail,splitby)
	If UBound(s) = 1 Then 'It has 2 items
		tosplit = s(1)
	End If
End Sub

Sub caddelement(elementname, elementvalue) 'image add element
	Dim element: Set element = xmldoc.createElement(elementname)
	element.text = elementvalue
	'Dim value : Set value = xmldoc.createTextNode(elementvalue)
	'element.appendChild(value)  'Set the element value
	image.appendChild(element) 'Add element to county
End Sub

'http://stackoverflow.com/questions/4328907/add-child-entry-to-a-specific-node-in-xml-file-in-vbscipt
Function PrettyPrintXml(xmldoc)
    Dim reader
    set reader = CreateObject("Msxml2.SAXXMLReader.6.0")
    Dim writer
    set writer = CreateObject("Msxml2.MXXMLWriter.6.0")
    writer.indent = True
    writer.omitXMLDeclaration = True
    reader.contentHandler = writer
    reader.putProperty "http://xml.org/sax/properties/lexical-handler", writer
    reader.parse(xmldoc)
    PrettyPrintXml = writer.output
End Function

After the download is complete, prepend the xml file with "<counties>", and append it with "</counties>" to give it a root element.

Tag Name number

[edit]

Javascript snippet for determining tag name/number/content. Run through developer tools in IE, should also work in firebug.

function tagnamenumber()
{
	var w = window.open('', '', 'width=1000,height=500,resizable,scrollbars');
	w.document.open();
	w.document.write("<html><head><title>Viewing image...</title></head>");
	w.document.write("<body onload = 'self.focus()'>");
	
	var HTML4elements = ["A", "ABBR", "ACRONYM", "ADDRESS", "APPLET", "AREA", "B", "BASE", "BASEFONT", "BDO", "BIG", "BLOCKQUOTE", "BODY", "BR", "BUTTON", "CAPTION", "CENTER", "CITE", "CODE", "COL", "COLGROUP", "DD", "DEL", "DFN", "DIR", "DIV", "DL", "DT", "EM", "FIELDSET", "FONT", "FORM", "FRAME", "FRAMESET", "H1", "H2", "H3", "H4", "H5", "H6", "HEAD", "HR", "HTML", "I", "IFRAME", "IMG", "INPUT", "INS", "ISINDEX", "KBD", "LABEL", "LEGEND", "LI", "LINK", "MAP", "MENU", "META", "NOFRAMES", "NOSCRIPT", "OBJECT", "OL", "OPTGROUP", "OPTION", "P", "PARAM", "PRE", "Q", "S", "SAMP", "SCRIPT", "SELECT", "SMALL", "SPAN", "STRIKE", "STRONG", "STYLE", "SUB", "SUP", "TABLE", "TBODY", "TD", "TEXTAREA", "TFOOT", "TH", "THEAD", "TITLE", "TR", "TT", "U", "UL", "VAR"];
	
	//doesn't work piece
	w.document.write("<p>");
	for (var counter0 = 0; counter0 < 90; counter0+=1)
	{
		var element = HTML4elements[counter0];
		w.document.write("<a href=\"#" + element + "\">" + element + "</a>, ");
	}
	w.document.write("</p></br></hr>");
	//
	
	for (var counter = 0; counter < 90; counter+=1)
	{
		var element = HTML4elements[counter];
		var elements = document.getElementsByTagName(element);
		
		if(elements.length > 0)
		{
			
			w.document.write("<h5>" + element + "</h5>");
			
			//start table
			w.document.write("<table border=\"1\" title=\"" + element + "\">");
			w.document.write("<tr>");
			w.document.write("<th>#</th>");
			w.document.write("<th>innerHTML</th>");
			w.document.write("<tr>");

			//write rows
			for (var elementscounter = 0; elementscounter < elements.length; elementscounter+=1)
			{
				w.document.write("<tr>");
				w.document.write("<td name=\"" + elementscounter + "\">" + elementscounter + "</td>");
				w.document.write("<td><pre><code>" + elements[elementscounter].innerHTML.replace(/</g,"&lt;").replace(/>/g,"&gt;") +"</code></pre></td>");
				w.document.write("</tr>");
			}
			
			//close table
			w.document.write("</table>");
			w.document.write("<br/>");
			
		}
	}
	w.document.write("</body></html>");
	w.document.close();
}
[edit]

Provides link number for each link in page...works as above.

function linkorder()
{
	var w = window.open('', '', 'width=1000,height=500,resizeable,scrollbars');
	w.document.open();
	w.document.write("<html><head><title>Links</title></head>");
	w.document.write("<body onload = 'self.focus()'>");
	var links = document.links;
	
	w.document.write("<h5>Document.links</h5>");
			
	//start table
	w.document.write("<table border=\"1\" >");
	w.document.write("<tr>");
	w.document.write("<th>#</th>");
	w.document.write("<th>innerHTML</th>");
	w.document.write("<tr>");

	//write rows
	for(var counter = 0; counter < links.length; counter +=1)
	{
		w.document.write("<tr>");
		w.document.write("<td>" + counter + "</td>");
		w.document.write("<td>" + links[counter].href + "</td>");
		w.document.write("</tr>");	
	}		
	//close table
	w.document.write("</table>");
	w.document.write("<br/>");
}

Upload

[edit]

The source xml file can be found at dropbox (~220MB). One of the dates is wrongly (on their site) spelled as "September 2008".

Const user = "user"
Const pass = "password"
Dim http: Set http = CreateObject("Microsoft.XMLHTTP")

'''''''''''''''
'Can also use https
Dim Console: Set Console = WScript.StdOut

''''Login
Dim x 'node

Console.WriteLine  "Logging in as :" & user 
Console.WriteLine "Logging in... 1/2"
'initial post
wikipost "format=xml&action=login&lgname=" & Escape(user) & "&lgpassword=" & Escape(pass),0
nodeset "//api/login/@result"
If x Is Nothing Then
        Console.WriteLine "Couldn't find initial login token."
        Quit
End If
If x.value <> "NeedToken" Then        
        Console.WriteLine  "Couldn't get initial login token."
        Quit
End If
 
'Repost with token
Console.WriteLine "Logging in... 2/2"
nodeset "//api/login/@token"
wikipost "format=xml&action=login&lgname=" & Escape(user) & "&lgpassword=" & Escape(pass) & "&lgtoken=" & x.value,0
nodeset "//api/login/@result"
If x Is Nothing Then
        WScript.Echo "Couldn't find login result."
        Quit
End If
If x.value <> "Success" Then
        Console.WriteLine "Login failed."
        Quit
End If
Console.WriteLine "Successfully logged in"
 
'''Edit Token
Dim edittoken
'Get edittoken from main talkpage
Console.WriteLine "Retrieving edit token..."
wikipost "format=xml&action=query&prop=info&intoken=edit&titles=Talk:Main%20Page", 0
nodeset("//api/query/pages/page/@edittoken")
If x Is Nothing Then
        Console.WriteLine "Couldn't find edittoken."
        Quit
End If
If x.value = "+\" Then
        Console.WriteLine "Invalid edittoken."
        Quit
End If
edittoken = x.value
Console.WriteLine("Edit token retrieved: " & edittoken)
 
'Multipart variables
Dim boundary: boundary = "8G9lbpohjyr5ewco0ho" 'Should be more random

'''ADODB.Stream
Dim Stream : Set Stream = CreateObject("ADODB.Stream")
'Type
Const adTypeBinary = 1
Const adTypeText = 2
'State
Const adStateClosed = 0
Const adStateOpen = 1



'''''''''''''''''''''
'Actual upload
Dim xDoc: Set xDoc = CreateObject("Msxml2.DOMDocument")
xDoc.validateOnParse = True

Console.WriteLine "Loading xml file"
If xDoc.Load("C:\Oregon.xml") Then ' The document loaded successfully.
	Console.WriteLine "XML file loaded"
	
   Dim root: Set root = xDoc.childNodes
   Dim counties : Set counties = root.item(0).childNodes
   
   Console.WriteLine "There are " & counties.length & " counties."
   
   Dim countycounter
   For countycounter = 31 To counties.length - 1
		Dim countyname : countyname = counties.item(countycounter).Attributes.item(0).text
		Dim countynamefull : countynamefull = countyname & " County, Oregon"
       
       'http://arcweb.sos.state.or.us/pages/records/local/county/scenic/menus/josephine/index.html
       
       editpage "Category: Scenic images of " & countynamefull, _
             "{{en|1=Scenic images from the of [[w:" & countynamefull & "|" & countynamefull & "]] from the [http://arcweb.sos.state.or.us/pages/records/local/county/scenic/menus/" & Replace(LCase(countyname)," ","") & "/index.html Oregon Historical County Records Guide]}}" &_
             "[[Category: Images from Oregon Historical County Records Guide]]" & vbNewLine &_
             "[[Category: " & countynamefull &"]]", _
              "Creating category for per [[Commons:Bots/Requests/Smallbot 2]]"
		
		Console.WriteLine "County: " & countyname & " ->" & countycounter
		
	   Dim images : Set images = counties.item(countycounter).childNodes
	   
	   Dim imagecounter, imageslength
	   imageslength = images.length - 1
	   For imagecounter = 0 To imageslength
	   		Console.WriteLine "Image: " & imagecounter & "/" & imageslength
	   		
			Dim image: Set image = images.item(imagecounter).childNodes
			
			'Get image data from xml
			Dim title : title = image.item(0).text
			Dim desc: desc = image.item(1).text
			Dim number: number = image.item(2).text
			Dim photonumber: photonumber = image.item(3).text
			Dim source: source = image.item(4).text
			Dim dimensions: dimensions = image.item(5).text
			Dim vdate: vdate = image.item(6).text
			Dim camera: camera = image.item(7).text
			Dim lens: lens = image.item(8).text
			Dim photographer: photographer = image.item(9).text
			Dim keywords: keywords = image.item(10).text
				keywords = Left(keywords, Len(keywords) - 1) 'Remove last .
			'Dim imagedata: imagedata = image.item(11).nodeTypedValue 'In binary
   
   			'Generate template + category
   			Dim template: template = "{{Information " & vbNewLine &_
   				"|Description= {{en|1=" & desc & "}}" & vbNewLine &_
   				"|Date= " & Year(vdate) & "-" & month0(Month(vdate)) & vbNewLine &_
   				"|Source= {{en|1=[http://arcweb.sos.state.or.us/pages/records/local/county/scenic/index.html Oregon Historical County Records Guide]:" &_
   					" [http://arcweb.sos.state.or.us/pages/records/local/county/scenic/" & Replace(LCase(countyname)," ","") & "/" &number & ".html Link]" &_
   					" [" & source & " Direct]}}" & vbNewLine &_
   				"|Author= {{en|1=" & photographer & "}}" & vbNewLine &_
   				"|Permission= {{Oregon Historical County Records Guide}}" & vbNewLine &_
   				"|Other_fields= " & vbNewLine &_
   					"{{Information field|name=Photo No. |value= " & photonumber & "}}" & vbNewLine &_
   					"{{Information field|name=Camera |value= " & camera & "}}" & vbNewLine &_
   					"{{Information field|name=Lens |value= " & lens & "}}" & vbNewLine &_
   					"{{Information field|name=Dimensions |value= [http://arcweb.sos.state.or.us/pages/records/local/county/scenic/buy.html " & dimensions & " Available for purchase from the Oregon State Archives.]}}" & vbNewLine &_
   					"{{Information field|name=Keywords |value= {{en|1=" & keywords & "}}}}" & vbNewLine &_
				"}}" & vbNewLine &_
				vbNewLine &_
				"[[Category: Scenic images of " & countynamefull & "]]"
	   		
	   		'Upload file
	   		multipartpost image.item(11).nodeTypedValue, _
	   			title & " (" & countynamefull & " scenic images" & ") (" & photonumber & ").jpg", _
	   			template, _
	   			"[[Commons:Bots/Requests/Smallbot 2]]: Uploading scenic images from Oregon State Archives"
	   		'Assume success...else check response in fiddler2
	   		'WScript.Quit
	   Next
   
   Next
   
Else ' The document failed to load.
	Console.WriteLine "XML failed to load:"
	
	'http://msdn.microsoft.com/en-us/library/aa468547.aspx
	Dim strErrText, xPe
	Set xPe = xDoc.parseError
	With xPe
		strErrText = "Your XML Document failed to load" & _
			"due the following error." & vbCrLf & _
			"Error #: " & .errorCode & ": " & xPE.reason & _
			"Line #: " & .Line & vbCrLf & _
			"Line Position: " & .linepos & vbCrLf & _
			"Position In File: " & .filepos & vbCrLf & _
			"Source Text: " & .srcText & vbCrLf & _
			"Document URL: " & .url
	End With
	Console.WriteLine strErrText
End If

Function month0 (input)
	If input <10 Then
		month0 = "0" & input
	Else
		month0 = input
	End If
End Function

'''''''''''''''''''''''''''''''''''
Sub wikipost(payload, attempt)
        http.open "POST","http://commons.wikimedia.org/w/api.php",False
        http.setRequestHeader "Content-Type", "application/x-www-form-urlencoded"
        http.send (payload)
        If (httpstatus() = false) Then
                If attempt < 3 Then
                        wikipost payload, attempt + 1
                Else
                        Console.WriteLine "POST FAILED|" & payload
                        'Quit, HTTP errors
                End If
        End If
End Sub
 
'Returns false if not 200 response
Function httpstatus()
        If http.status <> 200 Then
                httpstatus = False
        End If
        httpstatus = True
End Function
 
Sub nodeset(node)
        Set x= http.responseXML.selectSingleNode(node)
End Sub

Sub Quit
	Console.WriteLine "Quit"
	WScript.Quit
End Sub
 
Function editpage (page,text,summary)
        wikipost "format=xml&action=edit&title=" & Encode(page) & "&text=" & Encode(text) & "&summary=" & Encode(summary) &"&bot=1"& "&token=" & Encode(edittoken), 0
        nodeset "//api/edit/@result"
 
        If x Is Nothing Then
                'error
                Console.WriteLine "Edit page error->nothing->" & page
                Quit
        End If
        If x.value <> "Success" Then
                'error
                Console.WriteLine "Edit page failure." & page
                Quit
        End If
        editpage = 1
End Function

Sub multipartpost(source, filename, desc, comment)
        http.open "POST","http://commons.wikimedia.org/w/api.php",False
        http.setRequestHeader "Content-Type", "multipart/form-data; boundary=" & boundary
 
        Dim uploaddata
 
        Dim params: params = Array("action","format","filename","text","comment","ignorewarnings","token")
        Dim values: values = Array("upload","xml",filename,desc,comment,"1",edittoken)
        Dim count
        For count=0 To 6
                uploaddata = uploaddata & vbNewLine & "--" & boundary & vbNewLine &_
                "Content-Disposition: form-data; name=" & chr(34) & params(count) & chr(34) & vbNewLine &_
                "Content-Type: text/plain" & vbNewLine &_
                 vbNewLine & values(count)
        Next
 
        'application/octet-stream
        'image/jpg
        http.Send ConcatByteArrays( _
                ConcatByteArrays( _
                        StringToBinary(uploaddata & vbNewLine & "--" & boundary & vbNewLine &_
                                "Content-Disposition: form-data; name=" & chr(34) & "file" & chr(34) &"; filename=" &  chr(34) & filename & chr(34) & vbNewLine &_
                                "Content-Type: application/octet-stream" & vbNewLine &_
                                "Content-Transfer-Encoding: binary" & vbNewLine &_
                                vbNewLine), _
                        source), _
                StringToBinary(vbNewLine &_
                "--" & boundary & "--" & vbNewLine))
End Sub

'http://stackoverflow.com/questions/184574/how-to-append-binary-values-in-vbscript
Function ConcatByteArrays(bytearray1, bytearray2)
 
        ClearStream
        'Open stream and write 1st, 2nd byte array
        Stream.Open
        Stream.Type = adTypeBinary 'Binary
        Stream.Write bytearray1
        Stream.Write bytearray2
 
        Stream.Position = 0 'Reset position to read from start

        ConcatByteArrays = Stream.Read
        'Stream.Close
End Function

Sub ClearStream
        If Stream.State <> adStateClosed Then
                Stream.Close
        End If
End Sub

'http://www.motobit.com/tips/detpg_binasp/
Function StringToBinary(Text)
        ClearStream
 
        'Set as text stream
        Stream.Type = adTypeText
        Stream.CharSet = "us-ascii" 'can change to others

        'Write text to stream
        Stream.Open
        Stream.WriteText Text
 
        'Change stream to binary
        Stream.Position = 0 'Set position to 0 first
        Stream.Type = adTypeBinary
 
        'Open the stream as binary
        StringToBinary = Stream.Read
        'Stream.Close
End Function

'Very primitive escape
Function Encode(unencoded)
	Encode= Replace(Escape(unencoded),"+","%2B")
End Function

'XMLDOM
'http://msdn.microsoft.com/en-us/library/windows/desktop/ms763798%28v=vs.85%29.aspx

Fiddler

[edit]

In Fiddler 2 (requires .net 2), block .css/.js in the static function OnBeforeRequest. Add

 
if (oSession.uriContains(".css")){
    oSession["ui-color"]="orange"; 
    oSession["ui-bold"]="true";
    oSession.oRequest.FailSession(404, "Blocked", "Fiddler blocked CSS file");
}
if (oSession.uriContains(".js")){
    oSession["ui-color"]="orange"; 
    oSession["ui-bold"]="true";
    oSession.oRequest.FailSession(404, "Blocked", "Fiddler blocked JS file");
}

More info at http://www.fiddler2.com/Fiddler/dev/ScriptSamples.asp

  • Disable caching by checking Rules->Performance->Disable caching
  • Specify an appropriate bot UA (user agent) under Rules->User Agents->Custom
  • Move the google analytics .js request to the autoresponder if it isn't blocked already, enable automatic responses, and allow unmatched requests to pass through