User:Smallbot/source/Oregon Historical County Records Guide
The following is the source code used for Commons:Bots/Requests/Smallbot 2.
You need XP+, IE6+ (for mshtml), and .net 2+ (for fiddler).
The download is done in vbscript w/ xhr for the http requests, mshtml for the html parsing, ADOB.stream for the, Msxml2.DOMDocument for the xml. Images are encoded as base64 (which inflates their size ~33%) so they can be stored as plain text in the xml file.
XHR-> Microsoft.XMLHTTP (MSXML2.XMLHTTP.3.0)
Source
[edit]The source code is released under the MIT license
This file is licensed under the Expat License, sometimes known as the MIT License:
Copyright © 2012 Smallman12q Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. The Software is provided "as is", without warranty of any kind, express or implied, including but not limited to the warranties of merchantability, fitness for a particular purpose and noninfringement. In no event shall the authors or copyright holders be liable for any claim, damages or other liability, whether in an action of contract, tort or otherwise, arising from, out of or in connection with the Software or the use or other dealings in the Software.
|
Download
[edit]Const oregonxml = "C:\Oregon.xml" 'XML file where data is stored
Dim html: Set html = CreateObject("htmlfile") 'MSHTML.HTMLDocument
Dim http: Set http = CreateObject("Microsoft.XMLHTTP") 'MSXML2.XMLHTTP.3.0
Dim xmldoc: Set xmldoc = CreateObject("Msxml2.DOMDocument")
'Write to file after every image
Const ForAppending = 8
Dim fs: Set fs = CreateObject("Scripting.FileSystemObject")
Dim file: Set file = fs.OpenTextFile(oregonxml, ForAppending, True,0) '0=ANSI, -1=Unicode
Dim counties: counties = Array("Baker", "Benton", "Clackamas", "Clatsop", "Columbia", "Coos", "Crook", "Curry", "Deschutes", "Douglas", "Gilliam", "Grant", _
"Harney", "Hood River", "Jackson", "Jefferson", "Josephine", "Klamath", "Lake", "Lane", "Lincoln", "Linn", "Malheur", "Marion", _
"Morrow", "Multnomah", "Polk", "Sherman", "Tillamook", "Umatilla", "Union", "Wallowa", "Wasco", "Washington", "Wheeler", "Yamhill")
Dim countycounter, countyname
'For Each county In counties
For countycounter = 0 To 1 'Change here to number of counties
Set html = Nothing
Set html = CreateObject("htmlfile") 'New instance
countyname = counties(countycounter)
countynamel = Replace(LCase(countyname)," ","")
'Dim county: Set county = xmldoc.createElement("county")
'xmldoc.appendChild(county)
file.WriteLine("<county id=""" & countyname & """>")
'''''''''
'Get number of images from index
http.open "GET", "http://arcweb.sos.state.or.us/pages/records/local/county/scenic/menus/" & countynamel & "/index.html", False
http.send
html.write(http.responseText)
'Get number of images from h2 header: Benton County Scenic Images [88 images]
Dim h2 : h2= html.getElementsByTagName("h2")(0).innerHTML
Dim x1 : x1= InStr(h2,"[") + 1
Dim numberofimages : numberofimages = Mid(h2,x1, InStr(x1,h2," ") - x1)
'html.clear 'Not implemented http://msdn.microsoft.com/en-us/library/aa752567%28v=vs.85%29.aspx
html.open "about:blank" 'clears
'''''''
'Scrape and parse data for each image
Dim imagecounter 'Index starts at 1
For imagecounter = 1 To numberofimages
WScript.Echo imagecounter & "/" & numberofimages & " : " & countyname
Dim image: Set image = xmldoc.createElement("image")
xmldoc.appendChild(image)
'Load image site
http.open "GET", "http://arcweb.sos.state.or.us/pages/records/local/county/scenic/" & countynamel & "/" & imagecounter & ".html", False
http.send
html.write (http.responseText)
''''''
'Parse data
Dim x2 'Integer
Dim title: title = html.getElementsByTagName("p")(0).innerHTML
Dim desc: desc = html.getElementsByTagName("p")(3).innerHTML
x2 = InStr(desc,"(Photo No. ") + 11
Dim photonumber : photonumber = Mid(desc,x2, Len(desc) - x2)
desc = RTrim(Mid(desc,1,x2 - 12))
Dim source: source = html.getElementsByTagName("img")(2).src
x1=InStrRev(source,"/") + 1
source = Mid(source,x1)
source = "http://arcweb.sos.state.or.us/images/records/county/img/scenic/" &countynamel & "/" & source
'Parse out image details
Dim imagedetails: imagedetails = html.getElementsByTagName("p")(4).innerHTML
imagedetails = Mid(imagedetails,InStr(imagedetails,"<BR>") + 4)'Remove About this image
imagedetails = Split(imagedetails, ";", -1, 1)
'Lazy way of parsing out image details
Dim detail
Dim dimensions, datecreated, camera, lens, photographer, keywords
For Each detail In imagedetails
lz dimensions,"Dimensions: "
lz datecreated,"Date created: "
lz camera,"Camera: "
lz lens,"Lens: "
lz photographer,"Photographer citation: "
lz keywords,"Keywords/Tags: "
Next
'html.clear
html.open "about:blank"
''''''''
'Get image
http.open "GET", source, False
http.send
Dim b64 : Set b64 = xmldoc.createElement("imagedata")
With b64
.dataType = "bin.base64"
.nodeTypedValue = http.responseBody
End With
'''''''''
'Write out nodes
caddelement "title", title
caddelement "desc", desc
caddelement "number", imagecounter
caddelement "photonumber", photonumber
caddelement "source", source
caddelement "dimensions", dimensions
caddelement "date",datecreated
caddelement "camera", camera
caddelement "lens", lens
caddelement "photographer", photographer
caddelement "keywords", keywords
image.appendChild(b64)
''''''''''
'Append to file
file.WriteLine PrettyPrintXml (xmldoc)
xmldoc.removeChild(image)
WScript.Sleep 4000
Next
file.WriteLine("</county>")
Next
'Lazy split
Sub lz(ByRef tosplit, splitby)
Dim s: s=Split(detail,splitby)
If UBound(s) = 1 Then 'It has 2 items
tosplit = s(1)
End If
End Sub
Sub caddelement(elementname, elementvalue) 'image add element
Dim element: Set element = xmldoc.createElement(elementname)
element.text = elementvalue
'Dim value : Set value = xmldoc.createTextNode(elementvalue)
'element.appendChild(value) 'Set the element value
image.appendChild(element) 'Add element to county
End Sub
'http://stackoverflow.com/questions/4328907/add-child-entry-to-a-specific-node-in-xml-file-in-vbscipt
Function PrettyPrintXml(xmldoc)
Dim reader
set reader = CreateObject("Msxml2.SAXXMLReader.6.0")
Dim writer
set writer = CreateObject("Msxml2.MXXMLWriter.6.0")
writer.indent = True
writer.omitXMLDeclaration = True
reader.contentHandler = writer
reader.putProperty "http://xml.org/sax/properties/lexical-handler", writer
reader.parse(xmldoc)
PrettyPrintXml = writer.output
End Function
After the download is complete, prepend the xml file with "<counties>", and append it with "</counties>" to give it a root element.
Tag Name number
[edit]Javascript snippet for determining tag name/number/content. Run through developer tools in IE, should also work in firebug.
function tagnamenumber()
{
var w = window.open('', '', 'width=1000,height=500,resizable,scrollbars');
w.document.open();
w.document.write("<html><head><title>Viewing image...</title></head>");
w.document.write("<body onload = 'self.focus()'>");
var HTML4elements = ["A", "ABBR", "ACRONYM", "ADDRESS", "APPLET", "AREA", "B", "BASE", "BASEFONT", "BDO", "BIG", "BLOCKQUOTE", "BODY", "BR", "BUTTON", "CAPTION", "CENTER", "CITE", "CODE", "COL", "COLGROUP", "DD", "DEL", "DFN", "DIR", "DIV", "DL", "DT", "EM", "FIELDSET", "FONT", "FORM", "FRAME", "FRAMESET", "H1", "H2", "H3", "H4", "H5", "H6", "HEAD", "HR", "HTML", "I", "IFRAME", "IMG", "INPUT", "INS", "ISINDEX", "KBD", "LABEL", "LEGEND", "LI", "LINK", "MAP", "MENU", "META", "NOFRAMES", "NOSCRIPT", "OBJECT", "OL", "OPTGROUP", "OPTION", "P", "PARAM", "PRE", "Q", "S", "SAMP", "SCRIPT", "SELECT", "SMALL", "SPAN", "STRIKE", "STRONG", "STYLE", "SUB", "SUP", "TABLE", "TBODY", "TD", "TEXTAREA", "TFOOT", "TH", "THEAD", "TITLE", "TR", "TT", "U", "UL", "VAR"];
//doesn't work piece
w.document.write("<p>");
for (var counter0 = 0; counter0 < 90; counter0+=1)
{
var element = HTML4elements[counter0];
w.document.write("<a href=\"#" + element + "\">" + element + "</a>, ");
}
w.document.write("</p></br></hr>");
//
for (var counter = 0; counter < 90; counter+=1)
{
var element = HTML4elements[counter];
var elements = document.getElementsByTagName(element);
if(elements.length > 0)
{
w.document.write("<h5>" + element + "</h5>");
//start table
w.document.write("<table border=\"1\" title=\"" + element + "\">");
w.document.write("<tr>");
w.document.write("<th>#</th>");
w.document.write("<th>innerHTML</th>");
w.document.write("<tr>");
//write rows
for (var elementscounter = 0; elementscounter < elements.length; elementscounter+=1)
{
w.document.write("<tr>");
w.document.write("<td name=\"" + elementscounter + "\">" + elementscounter + "</td>");
w.document.write("<td><pre><code>" + elements[elementscounter].innerHTML.replace(/</g,"<").replace(/>/g,">") +"</code></pre></td>");
w.document.write("</tr>");
}
//close table
w.document.write("</table>");
w.document.write("<br/>");
}
}
w.document.write("</body></html>");
w.document.close();
}
Link number
[edit]Provides link number for each link in page...works as above.
function linkorder()
{
var w = window.open('', '', 'width=1000,height=500,resizeable,scrollbars');
w.document.open();
w.document.write("<html><head><title>Links</title></head>");
w.document.write("<body onload = 'self.focus()'>");
var links = document.links;
w.document.write("<h5>Document.links</h5>");
//start table
w.document.write("<table border=\"1\" >");
w.document.write("<tr>");
w.document.write("<th>#</th>");
w.document.write("<th>innerHTML</th>");
w.document.write("<tr>");
//write rows
for(var counter = 0; counter < links.length; counter +=1)
{
w.document.write("<tr>");
w.document.write("<td>" + counter + "</td>");
w.document.write("<td>" + links[counter].href + "</td>");
w.document.write("</tr>");
}
//close table
w.document.write("</table>");
w.document.write("<br/>");
}
Upload
[edit]The source xml file can be found at dropbox (~220MB). One of the dates is wrongly (on their site) spelled as "September 2008".
Const user = "user"
Const pass = "password"
Dim http: Set http = CreateObject("Microsoft.XMLHTTP")
'''''''''''''''
'Can also use https
Dim Console: Set Console = WScript.StdOut
''''Login
Dim x 'node
Console.WriteLine "Logging in as :" & user
Console.WriteLine "Logging in... 1/2"
'initial post
wikipost "format=xml&action=login&lgname=" & Escape(user) & "&lgpassword=" & Escape(pass),0
nodeset "//api/login/@result"
If x Is Nothing Then
Console.WriteLine "Couldn't find initial login token."
Quit
End If
If x.value <> "NeedToken" Then
Console.WriteLine "Couldn't get initial login token."
Quit
End If
'Repost with token
Console.WriteLine "Logging in... 2/2"
nodeset "//api/login/@token"
wikipost "format=xml&action=login&lgname=" & Escape(user) & "&lgpassword=" & Escape(pass) & "&lgtoken=" & x.value,0
nodeset "//api/login/@result"
If x Is Nothing Then
WScript.Echo "Couldn't find login result."
Quit
End If
If x.value <> "Success" Then
Console.WriteLine "Login failed."
Quit
End If
Console.WriteLine "Successfully logged in"
'''Edit Token
Dim edittoken
'Get edittoken from main talkpage
Console.WriteLine "Retrieving edit token..."
wikipost "format=xml&action=query&prop=info&intoken=edit&titles=Talk:Main%20Page", 0
nodeset("//api/query/pages/page/@edittoken")
If x Is Nothing Then
Console.WriteLine "Couldn't find edittoken."
Quit
End If
If x.value = "+\" Then
Console.WriteLine "Invalid edittoken."
Quit
End If
edittoken = x.value
Console.WriteLine("Edit token retrieved: " & edittoken)
'Multipart variables
Dim boundary: boundary = "8G9lbpohjyr5ewco0ho" 'Should be more random
'''ADODB.Stream
Dim Stream : Set Stream = CreateObject("ADODB.Stream")
'Type
Const adTypeBinary = 1
Const adTypeText = 2
'State
Const adStateClosed = 0
Const adStateOpen = 1
'''''''''''''''''''''
'Actual upload
Dim xDoc: Set xDoc = CreateObject("Msxml2.DOMDocument")
xDoc.validateOnParse = True
Console.WriteLine "Loading xml file"
If xDoc.Load("C:\Oregon.xml") Then ' The document loaded successfully.
Console.WriteLine "XML file loaded"
Dim root: Set root = xDoc.childNodes
Dim counties : Set counties = root.item(0).childNodes
Console.WriteLine "There are " & counties.length & " counties."
Dim countycounter
For countycounter = 31 To counties.length - 1
Dim countyname : countyname = counties.item(countycounter).Attributes.item(0).text
Dim countynamefull : countynamefull = countyname & " County, Oregon"
'http://arcweb.sos.state.or.us/pages/records/local/county/scenic/menus/josephine/index.html
editpage "Category: Scenic images of " & countynamefull, _
"{{en|1=Scenic images from the of [[w:" & countynamefull & "|" & countynamefull & "]] from the [http://arcweb.sos.state.or.us/pages/records/local/county/scenic/menus/" & Replace(LCase(countyname)," ","") & "/index.html Oregon Historical County Records Guide]}}" &_
"[[Category: Images from Oregon Historical County Records Guide]]" & vbNewLine &_
"[[Category: " & countynamefull &"]]", _
"Creating category for per [[Commons:Bots/Requests/Smallbot 2]]"
Console.WriteLine "County: " & countyname & " ->" & countycounter
Dim images : Set images = counties.item(countycounter).childNodes
Dim imagecounter, imageslength
imageslength = images.length - 1
For imagecounter = 0 To imageslength
Console.WriteLine "Image: " & imagecounter & "/" & imageslength
Dim image: Set image = images.item(imagecounter).childNodes
'Get image data from xml
Dim title : title = image.item(0).text
Dim desc: desc = image.item(1).text
Dim number: number = image.item(2).text
Dim photonumber: photonumber = image.item(3).text
Dim source: source = image.item(4).text
Dim dimensions: dimensions = image.item(5).text
Dim vdate: vdate = image.item(6).text
Dim camera: camera = image.item(7).text
Dim lens: lens = image.item(8).text
Dim photographer: photographer = image.item(9).text
Dim keywords: keywords = image.item(10).text
keywords = Left(keywords, Len(keywords) - 1) 'Remove last .
'Dim imagedata: imagedata = image.item(11).nodeTypedValue 'In binary
'Generate template + category
Dim template: template = "{{Information " & vbNewLine &_
"|Description= {{en|1=" & desc & "}}" & vbNewLine &_
"|Date= " & Year(vdate) & "-" & month0(Month(vdate)) & vbNewLine &_
"|Source= {{en|1=[http://arcweb.sos.state.or.us/pages/records/local/county/scenic/index.html Oregon Historical County Records Guide]:" &_
" [http://arcweb.sos.state.or.us/pages/records/local/county/scenic/" & Replace(LCase(countyname)," ","") & "/" &number & ".html Link]" &_
" [" & source & " Direct]}}" & vbNewLine &_
"|Author= {{en|1=" & photographer & "}}" & vbNewLine &_
"|Permission= {{Oregon Historical County Records Guide}}" & vbNewLine &_
"|Other_fields= " & vbNewLine &_
"{{Information field|name=Photo No. |value= " & photonumber & "}}" & vbNewLine &_
"{{Information field|name=Camera |value= " & camera & "}}" & vbNewLine &_
"{{Information field|name=Lens |value= " & lens & "}}" & vbNewLine &_
"{{Information field|name=Dimensions |value= [http://arcweb.sos.state.or.us/pages/records/local/county/scenic/buy.html " & dimensions & " Available for purchase from the Oregon State Archives.]}}" & vbNewLine &_
"{{Information field|name=Keywords |value= {{en|1=" & keywords & "}}}}" & vbNewLine &_
"}}" & vbNewLine &_
vbNewLine &_
"[[Category: Scenic images of " & countynamefull & "]]"
'Upload file
multipartpost image.item(11).nodeTypedValue, _
title & " (" & countynamefull & " scenic images" & ") (" & photonumber & ").jpg", _
template, _
"[[Commons:Bots/Requests/Smallbot 2]]: Uploading scenic images from Oregon State Archives"
'Assume success...else check response in fiddler2
'WScript.Quit
Next
Next
Else ' The document failed to load.
Console.WriteLine "XML failed to load:"
'http://msdn.microsoft.com/en-us/library/aa468547.aspx
Dim strErrText, xPe
Set xPe = xDoc.parseError
With xPe
strErrText = "Your XML Document failed to load" & _
"due the following error." & vbCrLf & _
"Error #: " & .errorCode & ": " & xPE.reason & _
"Line #: " & .Line & vbCrLf & _
"Line Position: " & .linepos & vbCrLf & _
"Position In File: " & .filepos & vbCrLf & _
"Source Text: " & .srcText & vbCrLf & _
"Document URL: " & .url
End With
Console.WriteLine strErrText
End If
Function month0 (input)
If input <10 Then
month0 = "0" & input
Else
month0 = input
End If
End Function
'''''''''''''''''''''''''''''''''''
Sub wikipost(payload, attempt)
http.open "POST","http://commons.wikimedia.org/w/api.php",False
http.setRequestHeader "Content-Type", "application/x-www-form-urlencoded"
http.send (payload)
If (httpstatus() = false) Then
If attempt < 3 Then
wikipost payload, attempt + 1
Else
Console.WriteLine "POST FAILED|" & payload
'Quit, HTTP errors
End If
End If
End Sub
'Returns false if not 200 response
Function httpstatus()
If http.status <> 200 Then
httpstatus = False
End If
httpstatus = True
End Function
Sub nodeset(node)
Set x= http.responseXML.selectSingleNode(node)
End Sub
Sub Quit
Console.WriteLine "Quit"
WScript.Quit
End Sub
Function editpage (page,text,summary)
wikipost "format=xml&action=edit&title=" & Encode(page) & "&text=" & Encode(text) & "&summary=" & Encode(summary) &"&bot=1"& "&token=" & Encode(edittoken), 0
nodeset "//api/edit/@result"
If x Is Nothing Then
'error
Console.WriteLine "Edit page error->nothing->" & page
Quit
End If
If x.value <> "Success" Then
'error
Console.WriteLine "Edit page failure." & page
Quit
End If
editpage = 1
End Function
Sub multipartpost(source, filename, desc, comment)
http.open "POST","http://commons.wikimedia.org/w/api.php",False
http.setRequestHeader "Content-Type", "multipart/form-data; boundary=" & boundary
Dim uploaddata
Dim params: params = Array("action","format","filename","text","comment","ignorewarnings","token")
Dim values: values = Array("upload","xml",filename,desc,comment,"1",edittoken)
Dim count
For count=0 To 6
uploaddata = uploaddata & vbNewLine & "--" & boundary & vbNewLine &_
"Content-Disposition: form-data; name=" & chr(34) & params(count) & chr(34) & vbNewLine &_
"Content-Type: text/plain" & vbNewLine &_
vbNewLine & values(count)
Next
'application/octet-stream
'image/jpg
http.Send ConcatByteArrays( _
ConcatByteArrays( _
StringToBinary(uploaddata & vbNewLine & "--" & boundary & vbNewLine &_
"Content-Disposition: form-data; name=" & chr(34) & "file" & chr(34) &"; filename=" & chr(34) & filename & chr(34) & vbNewLine &_
"Content-Type: application/octet-stream" & vbNewLine &_
"Content-Transfer-Encoding: binary" & vbNewLine &_
vbNewLine), _
source), _
StringToBinary(vbNewLine &_
"--" & boundary & "--" & vbNewLine))
End Sub
'http://stackoverflow.com/questions/184574/how-to-append-binary-values-in-vbscript
Function ConcatByteArrays(bytearray1, bytearray2)
ClearStream
'Open stream and write 1st, 2nd byte array
Stream.Open
Stream.Type = adTypeBinary 'Binary
Stream.Write bytearray1
Stream.Write bytearray2
Stream.Position = 0 'Reset position to read from start
ConcatByteArrays = Stream.Read
'Stream.Close
End Function
Sub ClearStream
If Stream.State <> adStateClosed Then
Stream.Close
End If
End Sub
'http://www.motobit.com/tips/detpg_binasp/
Function StringToBinary(Text)
ClearStream
'Set as text stream
Stream.Type = adTypeText
Stream.CharSet = "us-ascii" 'can change to others
'Write text to stream
Stream.Open
Stream.WriteText Text
'Change stream to binary
Stream.Position = 0 'Set position to 0 first
Stream.Type = adTypeBinary
'Open the stream as binary
StringToBinary = Stream.Read
'Stream.Close
End Function
'Very primitive escape
Function Encode(unencoded)
Encode= Replace(Escape(unencoded),"+","%2B")
End Function
'XMLDOM
'http://msdn.microsoft.com/en-us/library/windows/desktop/ms763798%28v=vs.85%29.aspx
Fiddler
[edit]In Fiddler 2 (requires .net 2), block .css/.js in the static function OnBeforeRequest
. Add
if (oSession.uriContains(".css")){
oSession["ui-color"]="orange";
oSession["ui-bold"]="true";
oSession.oRequest.FailSession(404, "Blocked", "Fiddler blocked CSS file");
}
if (oSession.uriContains(".js")){
oSession["ui-color"]="orange";
oSession["ui-bold"]="true";
oSession.oRequest.FailSession(404, "Blocked", "Fiddler blocked JS file");
}
More info at http://www.fiddler2.com/Fiddler/dev/ScriptSamples.asp
- Disable caching by checking
Rules->Performance->Disable caching
- Specify an appropriate bot UA (user agent) under
Rules->User Agents->Custom
- Move the google analytics .js request to the autoresponder if it isn't blocked already, enable automatic responses, and allow unmatched requests to pass through