MediaWiki:LibFileAnalyzer.js

From Wikimedia Commons, the free media repository
Jump to navigation Jump to search
Note: After saving, you have to bypass your browser's cache to see the changes. Internet Explorer: press Ctrl-F5, Mozilla: hold down Shift while clicking Reload (or press Ctrl-Shift-R), Opera/Konqueror: press F5, Safari: hold down Shift + Alt while clicking Reload, Chrome: hold down Shift while clicking Reload.
/**
 * [[MediaWiki:LibFileAnalyzer.js]]
 *
 * Analyzes files by scanning them for byte patterns.
 * Optimized for high-performance screening.
 *
 * Written to run inside a Web Worker.
 */

/*
 * Copyright (C) 2013 Rainer Rillke and others
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
 */

/// ATTENTION! JQUERY AND MEDIAWIKI VARIABLES NOT AVAILABLE!
/*jshint worker:true, bitwise:false*/
/*global FileReaderSync:false, CryptoJS:false*/

(function (w) {
	'use strict';

	// Type checks built on the tag reported by Object.prototype.toString,
	// e.g. '[object Array]' or '[object RegExp]'.

	/**
	 * @param {*} v value to inspect
	 * @param {string} t tag name to compare against, e.g. 'Array'
	 * @return {boolean} true if v reports the tag '[object <t>]'
	 */
	var isOfType = function (v, t) {
		var tag = Object.prototype.toString.call(v);
		return tag === '[object ' + t + ']';
	};

	/**
	 * @param {*} a value to inspect
	 * @return {boolean} true if a is a plain Array (typed arrays do not count)
	 */
	var isArray = function (a) {
		return isOfType(a, 'Array');
	};

	/**
	 * @param {*} r value to inspect
	 * @return {boolean} true if r is a RegExp
	 */
	var isRegExp = function (r) {
		return isOfType(r, 'RegExp');
	};


	/**
	 * Represents one term to look for in a sequence of bytes.
	 *
	 * Bytes are fed one at a time through addByte(); every complete occurrence
	 * of the term is appended to this.matches as { at: startPos, len: byteCount }.
	 *
	 * @param {string|Array} term the term to match.
	 *  Array: of unsigned bytes (0-255), used as is;
	 *  String: converted to its UTF-8 bytes with strToUTF8Arr().
	 * @throws {Error} if term is a RegExp (not implemented)
	 */
	function ScriptDetectorSearch(term) {
		if (isRegExp(term)) {
			throw new Error("RegExp for ScriptDetectorSearch not implemented, yet.");
		}

		// Arrays are already bytes; anything else is encoded to UTF-8 bytes.
		this.term = isArray(term) ? term : strToUTF8Arr(term);

		// Derived from the byte sequence, NOT from the string passed in:
		// a non-ASCII character occupies more than one byte, so the string
		// length would end the match too early.
		this.termLastIndex = this.term.length - 1;

		this.matches = [];
		this.possibleMatches = [];
	}

	/**
	 * Feeds the next byte into the search.
	 *
	 * A partial match only survives if it is continued at exactly the position
	 * following its previous byte. Callers may therefore skip bytes that do not
	 * occur in the term at all, as long as `pos` keeps counting them.
	 *
	 * @param {number} b the byte value
	 * @param {number} pos position of the byte in the scanned data
	 */
	ScriptDetectorSearch.prototype.addByte = function (b, pos) {
		var t = this.term,
			last = this.termLastIndex,
			pm = this.possibleMatches,
			kept = 0,
			i, pml, m;

		// Start a new match? It is advanced to offset 0 by the loop below.
		if (t[0] === b) {
			pm.push({
				posLast: pos - 1,
				offset: -1
			});
		}

		// Advance every candidate and compact the survivors to the front of
		// the list (in place, so the array identity is preserved).
		for (i = 0, pml = pm.length; i < pml; ++i) {
			m = pm[i];
			++m.posLast;
			++m.offset;

			// Dropped: there was a gap since the candidate's last byte,
			// or this byte is not the one the term expects next.
			if (m.posLast !== pos || t[m.offset] !== b) {
				continue;
			}

			// Complete: record the result and retire the candidate.
			if (m.offset === last) {
				this.matches.push({
					at: pos - last,
					len: last + 1
				});
				continue;
			}

			// Still a partial match.
			pm[kept++] = m;
		}
		pm.length = kept;
	};

	/**
	 * Returns a map of the byte values occurring in the term.
	 *
	 * @return {Array} array of length 256; index i is true if byte value i
	 *  occurs in the term and is unset otherwise
	 * @throws {Error} if this.isRegExp is set
	 */
	ScriptDetectorSearch.prototype.getRelevantChars = function () {
		if (this.isRegExp) throw new Error("You must not ask me for relevant chars if I am a regular expression." +
			"Oops, I forgot to mention that, I am an instance of ScriptDetectorSearch.");

		var term = this.term,
			mor = new Array(256), // Map chars relevant for this term
			i, l;

		for (i = 0, l = term.length; i < l; ++i) {
			mor[term[i] & 0xFF] = true;
		}

		return mor;
	};
	// Publish the constructor on the global passed into this closure (w),
	// unless a truthy ScriptDetectorSearch is already defined there.
	w.ScriptDetectorSearch = w.ScriptDetectorSearch || ScriptDetectorSearch;


	/**
	 * Holds the searches to run against a file, grouped by the byte values
	 * each search is interested in.
	 *
	 * mime and extension are only stored on the instance here.
	 *
	 * @param {string} mime the mime type of the file
	 * @param {string} extension the extension of the file
	 */
	function ScriptDetector(mime, extension) {
		var buckets = [],
			byteValue;

		// One (initially empty) list of interested searches per byte value 0-255.
		for (byteValue = 0; byteValue < 256; ++byteValue) {
			buckets.push([]);
		}

		this.mime = mime;
		this.extension = extension;
		this.mapOfInterest = buckets;
	}
	// Publish the constructor on the global passed into this closure (w),
	// unless a truthy ScriptDetector is already defined there.
	w.ScriptDetector = w.ScriptDetector || ScriptDetector;

	/**
	 * Registers a search so that it receives every byte whose value occurs
	 * in its term.
	 *
	 * @param {ScriptDetectorSearch} s a search that should be looked for while the buffer is read
	 */
	ScriptDetector.prototype.addSearch = function (s) {
		var relevant = s.getRelevantChars(),
			buckets = this.mapOfInterest,
			byteValue = 0;

		// Append the search to the bucket of each byte value it cares about.
		while (byteValue < 256) {
			if (relevant[byteValue]) {
				buckets[byteValue].push(s);
			}
			++byteValue;
		}
	};

	// Lookup table shared by all instances: byte value -> lower-cased byte value.
	// ASCII 'A'-'Z' (65-90) map to 'a'-'z' (97-122); every other value maps to itself.
	ScriptDetector.prototype.caseMap = (function () {
		var table = new Uint8Array(256),
			code;

		for (code = 0; code < 256; ++code) {
			table[code] = (code >= 65 && code <= 90) ? code + 32 : code;
		}
		return table;
	}());

	/**
	 * Scans one chunk of the file and forwards each byte to the searches
	 * registered for its (ASCII-lower-cased) value.
	 *
	 * Chunks must be added in file order. Positions handed to the searches
	 * continue counting across calls, so a term that straddles a chunk
	 * boundary is still found and reported offsets refer to the whole
	 * sequence of bytes added so far, not to the current chunk.
	 *
	 * @param {ArrayBuffer} arrBuff the next chunk of the file
	 */
	ScriptDetector.prototype.addChunk = function (arrBuff) {
		// First, convert the ArrayBuffer into a Uint8Array.
		// This is 4 times faster in Firefox compared to
		// iterating over the buffer using getUint8()
		var data = new Uint8Array(arrBuff),
			l = data.length,
			cm = this.caseMap, // Load in local function scope to speed-up access
			moi = this.mapOfInterest,
			// Number of bytes consumed by earlier calls; unset before the first chunk.
			base = this.bytesSeen || 0,
			b, i, x, ipl, interestedParties;

		// ++i is slightly faster compared to i++
		for (i = 0; i < l; ++i) {
			// ASCII to lower case
			// Accessors are quite fast, so we are going to use a map.
			// TODO: Check how PHP treats UTF-8 and whether we do it correctly here
			b = cm[data[i]];

			interestedParties = moi[b];
			for (x = 0, ipl = interestedParties.length; x < ipl; ++x) {
				// Absolute position: keeps partial matches from the previous
				// chunk consistent with the bytes of this one.
				interestedParties[x].addByte(b, base + i);
			}
		}

		this.bytesSeen = base + l;
	};


	/*\
	|*|
	|*|  Base64 / binary data / UTF-8 strings utilities
	|*|
	|*|  https://developer.mozilla.org/en-US/docs/Web/JavaScript/Base64_encoding_and_decoding
	|*|
	\*/
	/**
	 * Encodes a JavaScript (UTF-16) string as UTF-8.
	 *
	 * Surrogate pairs are combined into one code point and emitted as a single
	 * 4-byte sequence. An unpaired surrogate is emitted as the 3-byte encoding
	 * of its code unit.
	 *
	 * @param {string} sDOMStr the string to encode
	 * @return {Uint8Array} the UTF-8 bytes
	 */
	function strToUTF8Arr(sDOMStr) {

		var nStrLen = sDOMStr.length,
			aCodePoints = [],
			nArrLen = 0,
			aBytes, nChr, nLow, nIdx, i;

		/* mapping: decode UTF-16 into code points and size the output */

		for (i = 0; i < nStrLen; i++) {
			nChr = sDOMStr.charCodeAt(i);
			// High surrogate followed by a low surrogate: one astral code point.
			if (nChr >= 0xD800 && nChr <= 0xDBFF && i + 1 < nStrLen) {
				nLow = sDOMStr.charCodeAt(i + 1);
				if (nLow >= 0xDC00 && nLow <= 0xDFFF) {
					nChr = 0x10000 + ((nChr - 0xD800) << 10) + (nLow - 0xDC00);
					i++; // the low surrogate has been consumed
				}
			}
			aCodePoints.push(nChr);
			nArrLen += nChr < 0x80 ? 1 : nChr < 0x800 ? 2 : nChr < 0x10000 ? 3 : 4;
		}

		aBytes = new Uint8Array(nArrLen);

		/* transcription... */

		for (i = 0, nIdx = 0; i < aCodePoints.length; i++) {
			nChr = aCodePoints[i];
			if (nChr < 0x80) {
				/* one byte */
				aBytes[nIdx++] = nChr;
			} else if (nChr < 0x800) {
				/* two bytes */
				aBytes[nIdx++] = 192 + (nChr >>> 6);
				aBytes[nIdx++] = 128 + (nChr & 63);
			} else if (nChr < 0x10000) {
				/* three bytes */
				aBytes[nIdx++] = 224 + (nChr >>> 12);
				aBytes[nIdx++] = 128 + (nChr >>> 6 & 63);
				aBytes[nIdx++] = 128 + (nChr & 63);
			} else {
				/* four bytes (code points never exceed 0x10FFFF here) */
				aBytes[nIdx++] = 240 + (nChr >>> 18);
				aBytes[nIdx++] = 128 + (nChr >>> 12 & 63);
				aBytes[nIdx++] = 128 + (nChr >>> 6 & 63);
				aBytes[nIdx++] = 128 + (nChr & 63);
			}
		}

		return aBytes;

	}
	// Publish the helper on the global passed into this closure (w),
	// unless a truthy strToUTF8Arr is already defined there.
	w.strToUTF8Arr = w.strToUTF8Arr || strToUTF8Arr;
}(self));