MediaWiki:LibFileAnalyzer.js
Jump to navigation
Jump to search
Note: After saving, you have to bypass your browser's cache to see the changes. Internet Explorer: press Ctrl-F5, Mozilla: hold down Shift while clicking Reload (or press Ctrl-Shift-R), Opera/Konqueror: press F5, Safari: hold down Shift + Alt while clicking Reload, Chrome: hold down Shift while clicking Reload.
Documentation for this user script can be added at MediaWiki:LibFileAnalyzer.
/**
* [[MediaWiki:LibFileAnalyzer.js]]
*
* Analyzes files scanning for patterns.
* Optimized for High-performance-screening.
*
* Written for running inside a webworker.
*/
/*
* Copyright (C) 2013 Rainer Rillke and others
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*/
/// ATTENTION! JQUERY AND MEDIAWIKI VARIABLES NOT AVAILABLE!
/*jshint worker:true, bitwise:false*/
/*global FileReaderSync:false, CryptoJS:false*/
(function (w) {
'use strict';
// Type checks based on the internal class tag reported by
// Object.prototype.toString, e.g. '[object Array]'.
var isOfType = function (value, typeName) {
    var tag = Object.prototype.toString.call(value);
    return tag === '[object ' + typeName + ']';
  },
  // true only for genuine arrays (typed arrays report a different tag)
  isArray = function (candidate) {
    return isOfType(candidate, 'Array');
  },
  // true only for regular expression objects
  isRegExp = function (candidate) {
    return isOfType(candidate, 'RegExp');
  };
/**
 * Represents the term or pattern to match.
 *
 * The term is held as a sequence of byte values in `this.term`. Bytes are fed
 * in one at a time through addByte(); every completed occurrence is appended
 * to `this.matches` as `{at: <position of first byte>, len: <byte length>}`.
 *
 * Note: ScriptDetector.prototype.addChunk maps ASCII upper-case bytes to
 * lower case before calling addByte(), so terms used with it have to be
 * given in lower case to be found.
 *
 * @constructor
 * @param {string|Array|RegExp} term search term; Array: of unsigned bytes (0-255);
 *        String: converted to its UTF-8 bytes; RegExp: not supported
 * @throws {Error} if term is a RegExp
 */
function ScriptDetectorSearch(term) {
  if (isRegExp(term)) {
    throw new Error("RegExp for ScriptDetectorSearch not implemented, yet.");
  }
  // Arrays are taken as ready-made byte sequences, everything else is
  // handled as a string and converted to UTF-8.
  this.term = isArray(term) ? term : strToUTF8Arr(term);
  // Must be derived from the byte sequence: a string with non-ASCII
  // characters has more UTF-8 bytes than it has characters.
  this.termLastIndex = this.term.length - 1;
  this.matches = [];
  this.possibleMatches = [];
}

/**
 * Feeds one byte into the search.
 *
 * Each entry of `possibleMatches` is a partially matched occurrence:
 * `posLast` is the position of the last byte it accepted and `offset` the
 * index into the term of that byte.
 *
 * @param {number} b the byte value
 * @param {number} pos position of the byte; an occurrence is only continued
 *        by the byte at exactly the next position
 */
ScriptDetectorSearch.prototype.addByte = function (b, pos) {
  var term = this.term,
    lastIndex = this.termLastIndex,
    candidates = this.possibleMatches,
    kept = 0,
    candidate, i, count;

  // Start a new match? It is advanced to (pos, 0) by the loop below.
  if (term[0] === b) {
    candidates.push({
      posLast: pos - 1,
      offset: -1
    });
  }

  // Advance every candidate and compact the survivors to the front.
  for (i = 0, count = candidates.length; i < count; ++i) {
    candidate = candidates[i];
    candidate.posLast += 1;
    candidate.offset += 1;

    // A gap in positions means bytes were skipped in between (ScriptDetector
    // only reports bytes that occur in the term), so the run is broken.
    if (candidate.posLast !== pos || term[candidate.offset] !== b) {
      continue;
    }
    if (candidate.offset === lastIndex) {
      // Complete: record the hit and retire the candidate.
      this.matches.push({
        at: pos - lastIndex,
        len: lastIndex + 1
      });
      continue;
    }
    candidates[kept++] = candidate;
  }
  candidates.length = kept;
};

/**
 * Returns a map of the byte values occurring in the term.
 *
 * @return {Array} sparse array of length 256; index x is true if byte value x
 *         occurs in the term and is left unset otherwise
 * @throws {Error} if the instance is flagged as a regular expression
 */
ScriptDetectorSearch.prototype.getRelevantChars = function () {
  if (this.isRegExp) throw new Error("You must not ask me for relevant chars if I am a regular expression." +
    "Oops, I forgot to mention that, I am an instance of ScriptDetectorSearch.");
  var term = this.term,
    relevant = new Array(256),
    i;
  for (i = term.length - 1; i >= 0; --i) {
    relevant[term[i] & 0xFF] = true;
  }
  return relevant;
};
// Publish the constructor on w (the object handed to the enclosing function);
// an already present, truthy w.ScriptDetectorSearch is kept as it is.
w.ScriptDetectorSearch = w.ScriptDetectorSearch || ScriptDetectorSearch;
/**
 * Dispatches the bytes of a file to the searches registered for them.
 *
 * `mapOfInterest` holds one list per possible byte value (0-255); each list
 * contains the searches that want to be told about that byte value.
 *
 * @constructor
 * @param {string} mime the mime type of the file (stored unchanged)
 * @param {string} extension the extension of the file (stored unchanged)
 */
function ScriptDetector(mime, extension) {
  var BYTE_VALUES = 256,
    table = new Array(BYTE_VALUES),
    byteValue;
  this.mime = mime;
  this.extension = extension;
  // Every byte value gets its own, initially empty, list.
  for (byteValue = 0; byteValue < BYTE_VALUES; ++byteValue) {
    table[byteValue] = [];
  }
  this.mapOfInterest = table;
}
// Publish the constructor on w (the object handed to the enclosing function);
// an already present, truthy w.ScriptDetector is kept as it is.
w.ScriptDetector = w.ScriptDetector || ScriptDetector;
/**
 * Registers a search so that it is notified of every byte value that
 * occurs in its term.
 *
 * @param {ScriptDetectorSearch} s a search that should be looked for while the buffer is read
 */
ScriptDetector.prototype.addSearch = function (s) {
  var relevant = s.getRelevantChars(),
    listeners = this.mapOfInterest,
    byteValue = 0;
  while (byteValue < 256) {
    if (relevant[byteValue]) {
      listeners[byteValue].push(s);
    }
    ++byteValue;
  }
};
// Lookup table byte value -> byte value that turns the ASCII upper-case
// letters A-Z (65-90) into a-z (97-122) and leaves all other values alone.
ScriptDetector.prototype.caseMap = (function () {
  var UPPER_A = 65,
    UPPER_Z = 90,
    TO_LOWER = 32,
    table = new Uint8Array(256),
    code;
  for (code = 0; code < 256; ++code) {
    table[code] = (code >= UPPER_A && code <= UPPER_Z) ? code + TO_LOWER : code;
  }
  return table;
}());
/**
 * Runs the next chunk of the file through all registered searches.
 *
 * Every byte is lower-cased through caseMap and handed, together with its
 * position, to the searches listed for that byte value in mapOfInterest.
 *
 * Positions are counted continuously over all chunks passed to this instance
 * (the running total is kept in `this.bytesProcessed`). The searches keep
 * their partial matches between calls, so restarting at 0 for each chunk
 * would lose terms that straddle a chunk boundary and could let a stale
 * partial match from an earlier chunk line up with an unrelated byte of a
 * later one. For the first chunk the positions are the plain chunk indexes.
 *
 * @param {ArrayBuffer} arrBuff the chunk; chunks must be passed in file order
 */
ScriptDetector.prototype.addChunk = function (arrBuff) {
  // First, convert the ArrayBuffer into a Uint8Array.
  // This is 4 times faster in Firefox compared to
  // iterating over the buffer using getUint8()
  var data = new Uint8Array(arrBuff),
    l = data.length,
    cm = this.caseMap, // Load in local function scope to speed-up access
    moi = this.mapOfInterest,
    // Bytes consumed by earlier calls; unset before the first chunk
    base = this.bytesProcessed || 0,
    b, i, x, ipl, interestedParties;
  // ++i is slightly faster compared to i++
  for (i = 0; i < l; ++i) {
    // ASCII to lower case
    // Accessors are quite fast, so we are going to use a map.
    // TODO: Check how PHP treats UTF-8 and whether we do it correctly here
    b = cm[data[i]];
    interestedParties = moi[b];
    for (x = 0, ipl = interestedParties.length; x < ipl; ++x) {
      interestedParties[x].addByte(b, base + i);
    }
  }
  this.bytesProcessed = base + l;
};
/*\
|*|
|*| Base64 / binary data / UTF-8 strings utilities
|*|
|*| https://developer.mozilla.org/en-US/docs/Web/JavaScript/Base64_encoding_and_decoding
|*|
\*/
/**
 * Encodes a JavaScript (UTF-16) string as UTF-8.
 *
 * A surrogate pair is combined into its code point and written as one
 * four-byte sequence. A surrogate without its partner is written on its own
 * as a three-byte sequence.
 *
 * @param {string} sDOMStr the string to encode
 * @return {Uint8Array} the UTF-8 bytes of sDOMStr
 */
function strToUTF8Arr(sDOMStr) {
  var nStrLen = sDOMStr.length,
    aCodePoints = [],
    nArrLen = 0,
    aBytes, nChr, nLow, nChrIdx, nIdx, nCount;

  /* decoding UTF-16 and mapping... */
  for (nChrIdx = 0; nChrIdx < nStrLen; nChrIdx++) {
    nChr = sDOMStr.charCodeAt(nChrIdx);
    // charCodeAt returns UTF-16 code units: join high + low surrogate
    if (nChr >= 0xD800 && nChr <= 0xDBFF && nChrIdx + 1 < nStrLen) {
      nLow = sDOMStr.charCodeAt(nChrIdx + 1);
      if (nLow >= 0xDC00 && nLow <= 0xDFFF) {
        nChr = 0x10000 + ((nChr - 0xD800) << 10) + (nLow - 0xDC00);
        nChrIdx++;
      }
    }
    aCodePoints.push(nChr);
    nArrLen += nChr < 0x80 ? 1 : nChr < 0x800 ? 2 : nChr < 0x10000 ? 3 : 4;
  }

  aBytes = new Uint8Array(nArrLen);

  /* transcription... */
  for (nIdx = 0, nChrIdx = 0, nCount = aCodePoints.length; nChrIdx < nCount; nChrIdx++) {
    nChr = aCodePoints[nChrIdx];
    if (nChr < 0x80) {
      /* one byte */
      aBytes[nIdx++] = nChr;
    } else if (nChr < 0x800) {
      /* two bytes */
      aBytes[nIdx++] = 192 + (nChr >>> 6);
      aBytes[nIdx++] = 128 + (nChr & 63);
    } else if (nChr < 0x10000) {
      /* three bytes */
      aBytes[nIdx++] = 224 + (nChr >>> 12);
      aBytes[nIdx++] = 128 + (nChr >>> 6 & 63);
      aBytes[nIdx++] = 128 + (nChr & 63);
    } else {
      /* four bytes; code points end at 0x10FFFF, so nothing longer exists */
      aBytes[nIdx++] = 240 + (nChr >>> 18);
      aBytes[nIdx++] = 128 + (nChr >>> 12 & 63);
      aBytes[nIdx++] = 128 + (nChr >>> 6 & 63);
      aBytes[nIdx++] = 128 + (nChr & 63);
    }
  }
  return aBytes;
}
// Publish the function on w (the object handed to the enclosing function);
// an already present, truthy w.strToUTF8Arr is kept as it is.
w.strToUTF8Arr = w.strToUTF8Arr || strToUTF8Arr;
}(self));