Add third-party robots parser, including unit tests

This commit is contained in:
Tobi Schäfer 2025-05-10 21:44:19 +02:00 committed by Antville Git bot
parent f25b3c0b76
commit a7cabf0d63
2 changed files with 1394 additions and 0 deletions

code/Global/Robots.js (new file, 491 additions)

@@ -0,0 +1,491 @@
/**
* Trims the white space from the start and end of the line.
*
* If the line is an array it will strip the white space from
* the start and end of each element of the array.
*
* @param {string|Array} line
* @return {string|Array}
* @private
*/
function trimLine(line) {
if (!line) {
return null;
}
if (Array.isArray(line)) {
return line.map(trimLine);
}
return String(line).trim();
}
/**
* Removes comments from a line.
*
* @param {string} line
* @return {string}
* @private
*/
function removeComments(line) {
var commentStartIndex = line.indexOf('#');
if (commentStartIndex > -1) {
return line.substr(0, commentStartIndex);
}
return line;
}
/**
* Splits a line at the first occurrence of :
*
* @param {string} line
* @return {Array.<string>}
* @private
*/
function splitLine(line) {
var idx = String(line).indexOf(':');
if (!line || idx < 0) {
return null;
}
return [line.slice(0, idx), line.slice(idx + 1)];
}
/**
* Normalises the user-agent string by converting it to
* lower case and removing any version numbers.
*
* @param {string} userAgent
* @return {string}
* @private
*/
function formatUserAgent(userAgent) {
var formattedUserAgent = userAgent.toLowerCase();
// Strip the version number from robot/1.0 user agents
var idx = formattedUserAgent.indexOf('/');
if (idx > -1) {
formattedUserAgent = formattedUserAgent.substr(0, idx);
}
return formattedUserAgent.trim();
}
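// Illustrative examples (not part of the parser itself):
//   formatUserAgent('SomeBot/1.2') => 'somebot'
//   formatUserAgent('Mozilla/5.0') => 'mozilla'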
/**
* Normalises the URL encoding of a path by encoding
* unicode characters.
*
* @param {string} path
* @return {string}
* @private
*/
function normaliseEncoding(path) {
try {
return urlEncodeToUpper(encodeURI(path).replace(/%25/g, '%'));
} catch (e) {
return path;
}
}
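// Illustrative examples (not part of the parser itself):
//   normaliseEncoding('/π')      => '/%CF%80' (Unicode is percent-encoded)
//   normaliseEncoding('/%cf%80') => '/%CF%80' (existing encodings survive the
//                                   re-encode and are uppercased below)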
/**
* Converts URL percent-encodings to uppercase.
*
* e.g.: %2a%ef becomes %2A%EF
*
* @param {string} path
* @return {string}
* @private
*/
function urlEncodeToUpper(path) {
return path.replace(/%[0-9a-fA-F]{2}/g, function (match) {
return match.toUpperCase();
});
}
/**
* Matches a pattern against the specified path.
*
* Uses the same pattern-matching algorithm as the Google implementation in
* google/robotstxt, so it should be consistent with the spec.
*
* @see https://github.com/google/robotstxt/blob/f465f0ede81099dd8bc4aeb2966b3a892bd488b3/robots.cc#L74
* @param {string} pattern
* @param {string} path
* @return {boolean}
* @private
*/
function matches(pattern, path) {
// I've added extra comments to try to make this easier to understand.
// Stores the lengths of all the current matching substrings.
// Maximum number of possible matching lengths is every length in path plus
// 1 to handle 0 length too (if pattern starts with * which is zero or more)
var matchingLengths = new Array(path.length + 1);
var numMatchingLengths = 1;
// Initially longest match is 0
matchingLengths[0] = 0;
for (var p = 0; p < pattern.length; p++) {
// If $ is at the end of the pattern then we must match the whole path,
// which is true if the longest matching length equals the path length.
if (pattern[p] === '$' && p + 1 === pattern.length) {
return matchingLengths[numMatchingLengths - 1] === path.length;
}
// Handle wildcards
if (pattern[p] === '*') {
// Wildcard so all substrings minus the current smallest matching
// length are matches
numMatchingLengths = path.length - matchingLengths[0] + 1;
// Update matching lengths to include the smallest all the way up
// to numMatchingLengths
// Don't update smallest possible match as * matches zero or more
// so the smallest current match is also valid
for (var i = 1; i < numMatchingLengths; i++) {
matchingLengths[i] = matchingLengths[i - 1] + 1;
}
} else {
// Check whether the char at each matching length matches the pattern
// char; if it does, increment that length and keep it, otherwise drop it.
var numMatches = 0;
for (var i = 0; i < numMatchingLengths; i++) {
if (
matchingLengths[i] < path.length &&
path[matchingLengths[i]] === pattern[p]
) {
matchingLengths[numMatches++] = matchingLengths[i] + 1;
}
}
// No paths matched the current pattern char so not a match
if (numMatches === 0) {
return false;
}
numMatchingLengths = numMatches;
}
}
return true;
}
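// A few illustrative cases of the algorithm above (not part of the parser
// itself):
//   matches('/fish', '/fish/salmon.html') => true  (prefix match)
//   matches('/*.php$', '/index.php')      => true  ($ anchors to the end)
//   matches('/*.php$', '/index.php?x=1')  => false (query breaks the anchor)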
/**
* Parses the contents of a robots.txt file and adds the rules,
* sitemaps and preferred host to the given Robots instance.
*
* @param {string} contents
* @param {Robots} robots
* @private
*/
function parseRobots(contents, robots) {
var newlineRegex = /\r\n|\r|\n/;
var lines = contents
.split(newlineRegex)
.map(removeComments)
.map(splitLine)
.map(trimLine);
var currentUserAgents = [];
var isNoneUserAgentState = true;
for (var i = 0; i < lines.length; i++) {
var line = lines[i];
if (!line || !line[0]) {
continue;
}
switch (line[0].toLowerCase()) {
case 'user-agent':
if (isNoneUserAgentState) {
currentUserAgents.length = 0;
}
if (line[1]) {
currentUserAgents.push(formatUserAgent(line[1]));
}
break;
case 'disallow':
robots.addRule(currentUserAgents, line[1], false, i + 1);
break;
case 'allow':
robots.addRule(currentUserAgents, line[1], true, i + 1);
break;
case 'crawl-delay':
robots.setCrawlDelay(currentUserAgents, line[1]);
break;
case 'sitemap':
if (line[1]) {
robots.addSitemap(line[1]);
}
break;
case 'host':
if (line[1]) {
robots.setPreferredHost(line[1].toLowerCase());
}
break;
}
isNoneUserAgentState = line[0].toLowerCase() !== 'user-agent';
}
}
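// For illustration: parsing
//
//   User-agent: a
//   User-agent: b
//   Disallow: /private
//
// registers the /private rule for both 'a' and 'b', since consecutive
// user-agent lines form one group (tracked via isNoneUserAgentState).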
/**
* Finds the rule that applies to the specified path, if any.
*
* @param {string} path
* @param {Array.<Object>} rules
* @return {Object?}
* @private
*/
function findRule(path, rules) {
var matchedRule = null;
for (var i = 0; i < rules.length; i++) {
var rule = rules[i];
if (!matches(rule.pattern, path)) {
continue;
}
// The longest matching rule takes precedence
// If rules are the same length then allow takes precedence
if (!matchedRule || rule.pattern.length > matchedRule.pattern.length) {
matchedRule = rule;
} else if (
rule.pattern.length === matchedRule.pattern.length &&
rule.allow &&
!matchedRule.allow
) {
matchedRule = rule;
}
}
return matchedRule;
}
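// Illustrative precedence example (hypothetical rules array):
//
//   var rules = [
//     { pattern: '/fish', allow: false },
//     { pattern: '/fish/test', allow: true }
//   ];
//   findRule('/fish/test.html', rules); // => the allow rule (longer pattern)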
/**
* Converts the provided string into a URL object.
*
* Returns null if the provided string is not a valid URL.
*
* @param {string} url
* @return {?URL}
* @private
*/
function parseUrl(url) {
try {
// Specify a base URL to be used with relative paths. The base uses a
// non-existent subdomain, so it can never conflict with a real host;
// at worst, a relative URL would be allowed against it.
var parsedUrl = new URL(url, 'http://robots-relative.samclarke.com/');
if (!parsedUrl.port) {
parsedUrl.port = parsedUrl.protocol === 'https:' ? 443 : 80;
}
return parsedUrl;
} catch (e) {
return null;
}
}
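// Illustrative examples (not part of the parser itself):
//   parseUrl('/robots.txt')          // resolved against the placeholder
//                                    // host above
//   parseUrl('http://ex ample.com/') // => null (invalid URL)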
/**
* Parses the given robots.txt contents for the given robots.txt URL.
*
* @param {string} url The URL of the robots.txt file
* @param {string} contents The contents of the robots.txt file
* @constructor
*/
function Robots(url, contents) {
this._url = parseUrl(url) || {};
this._rules = Object.create(null);
this._sitemaps = [];
this._preferredHost = null;
parseRobots(contents || '', this);
}
/**
* Adds the specified allow/deny rule to the rules
* for the specified user-agents.
*
* @param {Array.<string>} userAgents
* @param {string} pattern
* @param {boolean} allow
* @param {number} [lineNumber] Should use 1-based indexing
*/
Robots.prototype.addRule = function (userAgents, pattern, allow, lineNumber) {
var rules = this._rules;
userAgents.forEach(function (userAgent) {
rules[userAgent] = rules[userAgent] || [];
if (!pattern) {
return;
}
rules[userAgent].push({
pattern: normaliseEncoding(pattern),
allow: allow,
lineNumber: lineNumber
});
});
};
/**
* Adds the specified delay to the specified user agents.
*
* @param {Array.<string>} userAgents
* @param {string} delayStr
*/
Robots.prototype.setCrawlDelay = function (userAgents, delayStr) {
var rules = this._rules;
var delay = Number(delayStr);
userAgents.forEach(function (userAgent) {
rules[userAgent] = rules[userAgent] || [];
if (isNaN(delay)) {
return;
}
rules[userAgent].crawlDelay = delay;
});
};
/**
* Add a sitemap
*
* @param {string} url
*/
Robots.prototype.addSitemap = function (url) {
this._sitemaps.push(url);
};
/**
* Sets the preferred host name
*
* @param {string} url
*/
Robots.prototype.setPreferredHost = function (url) {
this._preferredHost = url;
};
/**
* Returns the rule matching the given URL and user agent, if any.
*
* @param {string} url
* @param {string?} ua
* @param {boolean} explicit If true, the wildcard (*) user agent is ignored
* @return {Object?}
* @private
*/
Robots.prototype._getRule = function (url, ua, explicit) {
var parsedUrl = parseUrl(url) || {};
var userAgent = formatUserAgent(ua || '*');
// The base URL must match otherwise this robots.txt is not valid for it.
if (
parsedUrl.protocol !== this._url.protocol ||
parsedUrl.hostname !== this._url.hostname ||
parsedUrl.port !== this._url.port
) {
return;
}
var rules = this._rules[userAgent];
if (!explicit) {
rules = rules || this._rules['*'];
}
rules = rules || [];
var path = urlEncodeToUpper(parsedUrl.pathname + parsedUrl.search);
var rule = findRule(path, rules);
return rule;
};
/**
* Returns true if allowed, false if not allowed.
*
* Will return undefined if the URL is not valid for
* this robots.txt file.
*
* @param {string} url
* @param {string?} ua
* @return {boolean?}
*/
Robots.prototype.isAllowed = function (url, ua) {
var rule = this._getRule(url, ua, false);
if (typeof rule === 'undefined') {
return;
}
return !rule || rule.allow;
};
/**
* Returns the line number of the matching directive for the specified
* URL and user-agent if any.
*
* The line numbers start at 1 and go up (1-based indexing).
*
* Returns -1 if there is no matching directive. If a rule was added
* manually without a lineNumber then this will return undefined for
* that rule.
*
* @param {string} url
* @param {string?} ua
* @return {number?}
*/
Robots.prototype.getMatchingLineNumber = function (url, ua) {
var rule = this._getRule(url, ua, false);
return rule ? rule.lineNumber : -1;
};
/**
* Returns the opposite of isAllowed().
*
* Note that this returns true when isAllowed() returns undefined,
* i.e. when the URL is not valid for this robots.txt file.
*
* @param {string} url
* @param {string?} ua
* @return {boolean}
*/
Robots.prototype.isDisallowed = function (url, ua) {
return !this.isAllowed(url, ua);
};
/**
* Returns true if the URL is explicitly disallowed
* for the specified user agent (the wildcard user agent * is not considered).
*
* This will return undefined if the URL is not valid for this robots.txt file.
*
* @param {string} url
* @param {string} ua
* @return {boolean?}
*/
Robots.prototype.isExplicitlyDisallowed = function (url, ua) {
var rule = this._getRule(url, ua, true);
if (typeof rule === 'undefined') {
return;
}
return !(!rule || rule.allow);
};
/**
* Gets the crawl delay if there is one.
*
* Will return undefined if there is no crawl delay set.
*
* @param {string} ua
* @return {number?}
*/
Robots.prototype.getCrawlDelay = function (ua) {
var userAgent = formatUserAgent(ua || '*');
return (this._rules[userAgent] || this._rules['*'] || {}).crawlDelay;
};
/**
* Returns the preferred host if there is one.
*
* @return {string?}
*/
Robots.prototype.getPreferredHost = function () {
return this._preferredHost;
};
/**
* Returns an array of sitemap URLs if there are any.
*
* @return {Array.<string>}
*/
Robots.prototype.getSitemaps = function () {
return this._sitemaps.slice(0);
};
module.exports = Robots;
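// Example usage, a minimal sketch (the tests below consume this class via a
// factory wrapper in index.js, which is assumed to call `new Robots(...)`):
//
//   var robots = new Robots(
//     'http://www.example.com/robots.txt',
//     'User-agent: *\nDisallow: /secret/'
//   );
//   robots.isAllowed('http://www.example.com/secret/page.html'); // => false
//   robots.isAllowed('http://www.example.com/index.html');       // => true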

tests/robots.js (new file, 903 additions)

@@ -0,0 +1,903 @@
var robotsParser = require('../index');
var expect = require('chai').expect;
function testRobots(url, contents, allowed, disallowed) {
var robots = robotsParser(url, contents);
allowed.forEach(function (url) {
expect(robots.isAllowed(url)).to.equal(true);
});
disallowed.forEach(function (url) {
expect(robots.isDisallowed(url)).to.equal(true);
});
}
describe('Robots', function () {
it('should parse the disallow directive', function () {
var contents = [
'User-agent: *',
'Disallow: /fish/',
'Disallow: /test.html'
].join('\n');
var allowed = [
'http://www.example.com/fish',
'http://www.example.com/Test.html'
];
var disallowed = [
'http://www.example.com/fish/index.php',
'http://www.example.com/fish/',
'http://www.example.com/test.html'
];
testRobots('http://www.example.com/robots.txt', contents, allowed, disallowed);
});
it('should parse the allow directive', function () {
var contents = [
'User-agent: *',
'Disallow: /fish/',
'Disallow: /test.html',
'Allow: /fish/test.html',
'Allow: /test.html'
].join('\n');
var allowed = [
'http://www.example.com/fish',
'http://www.example.com/fish/test.html',
'http://www.example.com/Test.html',
'http://www.example.com/test.html'
];
var disallowed = [
'http://www.example.com/fish/index.php',
'http://www.example.com/fish/',
];
testRobots('http://www.example.com/robots.txt', contents, allowed, disallowed);
});
it('should parse patterns', function () {
var contents = [
'User-agent: *',
'Disallow: /fish*.php',
'Disallow: /*.dext$',
'Disallow: /dir*'
].join('\n');
var allowed = [
'http://www.example.com/Fish.PHP',
'http://www.example.com/Fish.dext1',
'http://www.example.com/folder/dir.html',
'http://www.example.com/folder/dir/test.html'
];
var disallowed = [
'http://www.example.com/fish.php',
'http://www.example.com/fishheads/catfish.php?parameters',
'http://www.example.com/AnYthInG.dext',
'http://www.example.com/Fish.dext.dext',
'http://www.example.com/dir/test.html',
'http://www.example.com/directory.html'
];
testRobots('http://www.example.com/robots.txt', contents, allowed, disallowed);
});
it('should have the correct order precedence for allow and disallow', function () {
var contents = [
'User-agent: *',
'Disallow: /fish*.php',
'Allow: /fish/index.php',
'Disallow: /test',
'Allow: /test/',
'Disallow: /aa/',
'Allow: /aa/',
'Allow: /bb/',
'Disallow: /bb/',
].join('\n');
var allowed = [
'http://www.example.com/test/index.html',
'http://www.example.com/fish/index.php',
'http://www.example.com/test/',
'http://www.example.com/aa/',
'http://www.example.com/bb/',
'http://www.example.com/x/'
];
var disallowed = [
'http://www.example.com/fish.php',
'http://www.example.com/fishheads/catfish.php?parameters',
'http://www.example.com/test'
];
testRobots('http://www.example.com/robots.txt', contents, allowed, disallowed);
});
it('should have the correct order precedence for wildcards', function () {
var contents = [
'User-agent: *',
'Disallow: /*/',
'Allow: /x/',
].join('\n');
var allowed = [
'http://www.example.com/x/',
'http://www.example.com/fish.php',
'http://www.example.com/test'
];
var disallowed = [
'http://www.example.com/a/',
'http://www.example.com/xx/',
'http://www.example.com/test/index.html'
];
testRobots('http://www.example.com/robots.txt', contents, allowed, disallowed);
});
it('should parse lines delimited by \r', function () {
var contents = [
'User-agent: *',
'Disallow: /fish/',
'Disallow: /test.html'
].join('\r');
var allowed = [
'http://www.example.com/fish',
'http://www.example.com/Test.html'
];
var disallowed = [
'http://www.example.com/fish/index.php',
'http://www.example.com/fish/',
'http://www.example.com/test.html'
];
testRobots('http://www.example.com/robots.txt', contents, allowed, disallowed);
});
it('should parse lines delimited by \r\n', function () {
var contents = [
'User-agent: *',
'Disallow: /fish/',
'Disallow: /test.html'
].join('\r\n');
var allowed = [
'http://www.example.com/fish',
'http://www.example.com/Test.html'
];
var disallowed = [
'http://www.example.com/fish/index.php',
'http://www.example.com/fish/',
'http://www.example.com/test.html'
];
testRobots('http://www.example.com/robots.txt', contents, allowed, disallowed);
});
it('should parse lines delimited by mixed line endings', function () {
var contents = [
'User-agent: *\r',
'Disallow: /fish/\r\n',
'Disallow: /test.html\n\n'
].join('');
var allowed = [
'http://www.example.com/fish',
'http://www.example.com/Test.html'
];
var disallowed = [
'http://www.example.com/fish/index.php',
'http://www.example.com/fish/',
'http://www.example.com/test.html'
];
testRobots('http://www.example.com/robots.txt', contents, allowed, disallowed);
});
it('should ignore rules that are not in a group', function () {
var contents = [
'Disallow: /secret.html',
'Disallow: /test',
].join('\n');
var allowed = [
'http://www.example.com/secret.html',
'http://www.example.com/test/index.html',
'http://www.example.com/test/'
];
testRobots('http://www.example.com/robots.txt', contents, allowed, []);
});
it('should ignore comments', function () {
var contents = [
'#',
'# This is a comment',
'#',
'User-agent: *',
'# This is a comment',
'Disallow: /fish/ # ignore',
'# Disallow: fish',
'Disallow: /test.html'
].join('\n');
var allowed = [
'http://www.example.com/fish',
'http://www.example.com/Test.html'
];
var disallowed = [
'http://www.example.com/fish/index.php',
'http://www.example.com/fish/',
'http://www.example.com/test.html'
];
testRobots('http://www.example.com/robots.txt', contents, allowed, disallowed);
});
it('should ignore invalid lines', function () {
var contents = [
'invalid line',
'User-agent: *',
'Disallow: /fish/',
':::::another invalid line:::::',
'Disallow: /test.html',
'Unknown: tule'
].join('\n');
var allowed = [
'http://www.example.com/fish',
'http://www.example.com/Test.html'
];
var disallowed = [
'http://www.example.com/fish/index.php',
'http://www.example.com/fish/',
'http://www.example.com/test.html'
];
testRobots('http://www.example.com/robots.txt', contents, allowed, disallowed);
});
it('should ignore empty user-agent lines', function () {
var contents = [
'User-agent:',
'Disallow: /fish/',
'Disallow: /test.html'
].join('\n');
var allowed = [
'http://www.example.com/fish',
'http://www.example.com/Test.html',
'http://www.example.com/fish/index.php',
'http://www.example.com/fish/',
'http://www.example.com/test.html'
];
var disallowed = [];
testRobots('http://www.example.com/robots.txt', contents, allowed, disallowed);
});
it('should support groups with multiple user agents (case insensitive)', function () {
var contents = [
'User-agent: agenta',
'User-agent: agentb',
'Disallow: /fish',
].join('\n');
var robots = robotsParser('http://www.example.com/robots.txt', contents);
expect(robots.isAllowed("http://www.example.com/fish", "agenta")).to.equal(false);
});
it('should return undefined for invalid URLs', function () {
var contents = [
'User-agent: *',
'Disallow: /secret.html',
'Disallow: /test',
].join('\n');
var invalidUrls = [
'http://example.com/secret.html',
'http://ex ample.com/secret.html',
'http://www.example.net/test/index.html',
'http://www.examsple.com/test/',
'example.com/test/',
':::::;;`\\|/.example.com/test/'
];
var robots = robotsParser('http://www.example.com/robots.txt', contents);
invalidUrls.forEach(function (url) {
expect(robots.isAllowed(url)).to.equal(undefined);
});
});
it('should handle Unicode, urlencoded and punycode URLs', function () {
var contents = [
'User-agent: *',
'Disallow: /secret.html',
'Disallow: /test',
].join('\n');
var allowed = [
'http://www.münich.com/index.html',
'http://www.xn--mnich-kva.com/index.html',
'http://www.m%C3%BCnich.com/index.html'
];
var disallowed = [
'http://www.münich.com/secret.html',
'http://www.xn--mnich-kva.com/secret.html',
'http://www.m%C3%BCnich.com/secret.html'
];
testRobots('http://www.münich.com/robots.txt', contents, allowed, disallowed);
testRobots('http://www.xn--mnich-kva.com/robots.txt', contents, allowed, disallowed);
testRobots('http://www.m%C3%BCnich.com/robots.txt', contents, allowed, disallowed);
});
it('should handle Unicode and urlencoded paths', function () {
var contents = [
'User-agent: *',
'Disallow: /%CF%80',
'Disallow: /%e2%9d%83',
'Disallow: /%a%a',
'Disallow: /💩',
'Disallow: /✼*t$',
'Disallow: /%E2%9C%A4*t$',
'Disallow: /✿%a',
'Disallow: /http%3A%2F%2Fexample.org'
].join('\n');
var allowed = [
'http://www.example.com/✼testing',
'http://www.example.com/%E2%9C%BCtesting',
'http://www.example.com/✤testing',
'http://www.example.com/%E2%9C%A4testing',
'http://www.example.com/http://example.org',
'http://www.example.com/http:%2F%2Fexample.org'
];
var disallowed = [
'http://www.example.com/%CF%80',
'http://www.example.com/%CF%80/index.html',
'http://www.example.com/π',
'http://www.example.com/π/index.html',
'http://www.example.com/%e2%9d%83',
'http://www.example.com/%E2%9D%83/index.html',
'http://www.example.com/❃',
'http://www.example.com/❃/index.html',
'http://www.example.com/%F0%9F%92%A9',
'http://www.example.com/%F0%9F%92%A9/index.html',
'http://www.example.com/💩',
'http://www.example.com/💩/index.html',
'http://www.example.com/%a%a',
'http://www.example.com/%a%a/index.html',
'http://www.example.com/✼test',
'http://www.example.com/%E2%9C%BCtest',
'http://www.example.com/✤test',
'http://www.example.com/%E2%9C%A4testt',
'http://www.example.com/✿%a',
'http://www.example.com/%E2%9C%BF%atest',
'http://www.example.com/http%3A%2F%2Fexample.org'
];
testRobots('http://www.example.com/robots.txt', contents, allowed, disallowed);
});
it('should handle lone high / low surrogates', function () {
var contents = [
'User-agent: *',
'Disallow: /\uD800',
'Disallow: /\uDC00'
].join('\n');
// These are invalid so can't be disallowed
var allowed = [
'http://www.example.com/\uDC00',
'http://www.example.com/\uD800'
];
var disallowed = [];
testRobots('http://www.example.com/robots.txt', contents, allowed, disallowed);
});
it('should ignore host case', function () {
var contents = [
'User-agent: *',
'Disallow: /secret.html',
'Disallow: /test',
].join('\n');
var allowed = [
'http://www.example.com/index.html',
'http://www.ExAmPlE.com/index.html',
'http://www.EXAMPLE.com/index.html'
];
var disallowed = [
'http://www.example.com/secret.html',
'http://www.ExAmPlE.com/secret.html',
'http://www.EXAMPLE.com/secret.html'
];
testRobots('http://www.eXample.com/robots.txt', contents, allowed, disallowed);
});
it('should handle relative paths', function () {
var contents = [
'User-agent: *',
'Disallow: /fish',
'Allow: /fish/test',
].join('\n');
var robots = robotsParser('/robots.txt', contents);
expect(robots.isAllowed('/fish/test')).to.equal(true);
expect(robots.isAllowed('/fish')).to.equal(false);
});
it('should not allow relative paths if domain specified', function () {
var contents = [
'User-agent: *',
'Disallow: /fish',
'Allow: /fish/test',
].join('\n');
var robots = robotsParser('http://www.example.com/robots.txt', contents);
expect(robots.isAllowed('/fish/test')).to.equal(undefined);
expect(robots.isAllowed('/fish')).to.equal(undefined);
});
it('should not treat invalid robots.txt URLs as relative', function () {
var contents = [
'User-agent: *',
'Disallow: /fish',
'Allow: /fish/test',
].join('\n');
var robots = robotsParser('https://ex ample.com/robots.txt', contents);
expect(robots.isAllowed('/fish/test')).to.equal(undefined);
expect(robots.isAllowed('/fish')).to.equal(undefined);
});
it('should not allow URLs if domain specified and robots.txt is relative', function () {
var contents = [
'User-agent: *',
'Disallow: /fish',
'Allow: /fish/test',
].join('\n');
var robots = robotsParser('/robots.txt', contents);
expect(robots.isAllowed('http://www.example.com/fish/test')).to.equal(undefined);
expect(robots.isAllowed('http://www.example.com/fish')).to.equal(undefined);
});
it('should allow all if empty robots.txt', function () {
var allowed = [
'http://www.example.com/secret.html',
'http://www.example.com/test/index.html',
'http://www.example.com/test/'
];
var robots = robotsParser('http://www.example.com/robots.txt', '');
allowed.forEach(function (url) {
expect(robots.isAllowed(url)).to.equal(true);
});
});
it('should treat null as allowing all', function () {
var robots = robotsParser('http://www.example.com/robots.txt', null);
expect(robots.isAllowed("http://www.example.com/", "userAgent")).to.equal(true);
expect(robots.isAllowed("http://www.example.com/")).to.equal(true);
});
it('should handle invalid robots.txt URLs', function () {
var contents = [
'user-agent: *',
'disallow: /',
'host: www.example.com',
'sitemap: /sitemap.xml'
].join('\n');
var sitemapUrls = [
undefined,
null,
'null',
':/wom/test/'
];
sitemapUrls.forEach(function (url) {
var robots = robotsParser(url, contents);
expect(robots.isAllowed('http://www.example.com/index.html')).to.equal(undefined);
expect(robots.getPreferredHost()).to.equal('www.example.com');
expect(robots.getSitemaps()).to.eql(['/sitemap.xml']);
});
});
it('should parse the crawl-delay directive', function () {
var contents = [
'user-agent: a',
'crawl-delay: 1',
'user-agent: b',
'disallow: /d',
'user-agent: c',
'user-agent: d',
'crawl-delay: 10'
].join('\n');
var robots = robotsParser('http://www.example.com/robots.txt', contents);
expect(robots.getCrawlDelay('a')).to.equal(1);
expect(robots.getCrawlDelay('b')).to.equal(undefined);
expect(robots.getCrawlDelay('c')).to.equal(10);
expect(robots.getCrawlDelay('d')).to.equal(10);
expect(robots.getCrawlDelay()).to.equal(undefined);
});
it('should ignore invalid crawl-delay directives', function () {
var contents = [
'user-agent: a',
'crawl-delay: 1.2.1',
'user-agent: b',
'crawl-delay: 1.a0',
'user-agent: c',
'user-agent: d',
'crawl-delay: 10a'
].join('\n');
var robots = robotsParser('http://www.example.com/robots.txt', contents);
expect(robots.getCrawlDelay('a')).to.equal(undefined);
expect(robots.getCrawlDelay('b')).to.equal(undefined);
expect(robots.getCrawlDelay('c')).to.equal(undefined);
expect(robots.getCrawlDelay('d')).to.equal(undefined);
});
it('should parse the sitemap directive', function () {
var contents = [
'user-agent: a',
'crawl-delay: 1',
'sitemap: http://example.com/test.xml',
'user-agent: b',
'disallow: /d',
'sitemap: /sitemap.xml',
'sitemap: http://example.com/test/sitemap.xml '
].join('\n');
var robots = robotsParser('http://www.example.com/robots.txt', contents);
expect(robots.getSitemaps()).to.eql([
'http://example.com/test.xml',
'/sitemap.xml',
'http://example.com/test/sitemap.xml'
]);
});
it('should parse the host directive', function () {
var contents = [
'user-agent: a',
'crawl-delay: 1',
'host: www.example.net',
'user-agent: b',
'disallow: /d',
'host: example.com'
].join('\n');
var robots = robotsParser('http://www.example.com/robots.txt', contents);
expect(robots.getPreferredHost()).to.equal('example.com');
});
it('should parse empty and invalid directives', function () {
var contents = [
'user-agent:',
'user-agent:::: a::',
'crawl-delay:',
'crawl-delay:::: 0:',
'host:',
'host:: example.com',
'sitemap:',
'sitemap:: site:map.xml',
'disallow:',
'disallow::: /:',
'allow:',
'allow::: /:',
].join('\n');
robotsParser('http://www.example.com/robots.txt', contents);
});
it('should treat only the last host directive as valid', function () {
var contents = [
'user-agent: a',
'crawl-delay: 1',
'host: www.example.net',
'user-agent: b',
'disallow: /d',
'host: example.net',
'host: example.com'
].join('\n');
var robots = robotsParser('http://www.example.com/robots.txt', contents);
expect(robots.getPreferredHost()).to.equal('example.com');
});
it('should return null when there is no host directive', function () {
var contents = [
'user-agent: a',
'crawl-delay: 1',
'user-agent: b',
'disallow: /d',
].join('\n');
var robots = robotsParser('http://www.example.com/robots.txt', contents);
expect(robots.getPreferredHost()).to.equal(null);
});
it('should fallback to * when a UA has no rules of its own', function () {
var contents = [
'user-agent: *',
'crawl-delay: 1',
'user-agent: b',
'crawl-delay: 12',
'user-agent: c',
'user-agent: d',
'crawl-delay: 10'
].join('\n');
var robots = robotsParser('http://www.example.com/robots.txt', contents);
expect(robots.getCrawlDelay('should-fall-back')).to.equal(1);
expect(robots.getCrawlDelay('d')).to.equal(10);
expect(robots.getCrawlDelay('dd')).to.equal(1);
});
it('should not fallback to * when a UA has rules', function () {
var contents = [
'user-agent: *',
'crawl-delay: 1',
'user-agent: b',
'disallow:'
].join('\n');
var robots = robotsParser('http://www.example.com/robots.txt', contents);
expect(robots.getCrawlDelay('b')).to.equal(undefined);
});
it('should handle UAs with object property names', function () {
var contents = [
'User-agent: *',
'Disallow: /fish',
].join('\n');
var robots = robotsParser('http://www.example.com/robots.txt', contents);
expect(robots.isAllowed('http://www.example.com/fish', 'constructor')).to.equal(false);
expect(robots.isAllowed('http://www.example.com/fish', '__proto__')).to.equal(false);
});
it('should ignore version numbers in the UA string', function () {
var contents = [
'user-agent: *',
'crawl-delay: 1',
'user-agent: b',
'crawl-delay: 12',
'user-agent: c',
'user-agent: d',
'crawl-delay: 10'
].join('\n');
var robots = robotsParser('http://www.example.com/robots.txt', contents);
expect(robots.getCrawlDelay('should-fall-back/1.0.0')).to.equal(1);
expect(robots.getCrawlDelay('d/12')).to.equal(10);
expect(robots.getCrawlDelay('dd / 0-32-3')).to.equal(1);
expect(robots.getCrawlDelay('b / 1.0')).to.equal(12);
});
it('should return the line number of the matching directive', function () {
var contents = [
'',
'User-agent: *',
'',
'Disallow: /fish/',
'Disallow: /test.html',
'Allow: /fish/test.html',
'Allow: /test.html',
'',
'User-agent: a',
'allow: /',
'',
'User-agent: b',
'disallow: /test',
'disallow: /t*t',
'',
'User-agent: c',
'Disallow: /fish*.php',
'Allow: /fish/index.php'
].join('\n');
var robots = robotsParser('http://www.example.com/robots.txt', contents);
expect(robots.getMatchingLineNumber('http://www.example.com/fish')).to.equal(-1);
expect(robots.getMatchingLineNumber('http://www.example.com/fish/test.html')).to.equal(6);
expect(robots.getMatchingLineNumber('http://www.example.com/Test.html')).to.equal(-1);
expect(robots.getMatchingLineNumber('http://www.example.com/fish/index.php')).to.equal(4);
expect(robots.getMatchingLineNumber('http://www.example.com/fish/')).to.equal(4);
expect(robots.getMatchingLineNumber('http://www.example.com/test.html')).to.equal(7);
expect(robots.getMatchingLineNumber('http://www.example.com/test.html', 'a')).to.equal(10);
expect(robots.getMatchingLineNumber('http://www.example.com/fish.php', 'c')).to.equal(17);
expect(robots.getMatchingLineNumber('http://www.example.com/fish/index.php', 'c')).to.equal(18);
});
it('should handle large wildcards efficiently', function () {
var contents = [
'User-agent: *',
'Disallow: /' + '*'.repeat(2048) + '.html',
].join('\n');
var allowed = [
'http://www.example.com/' + 'sub'.repeat(2048) + 'folder/index.php',
];
var disallowed = [
'http://www.example.com/secret.html'
];
const start = Date.now();
testRobots('http://www.eXample.com/robots.txt', contents, allowed, disallowed);
const end = Date.now();
// Should take less than 500 ms (generous to allow for variability
// between machines running the test; should normally be much less)
expect(end - start).to.be.lessThan(500);
});
it('should honor given port number', function () {
var contents = [
'User-agent: *',
'Disallow: /fish/',
'Disallow: /test.html'
].join('\n');
var allowed = [
'http://www.example.com:8080/fish',
'http://www.example.com:8080/Test.html'
];
var disallowed = [
'http://www.example.com/fish',
'http://www.example.com/Test.html',
'http://www.example.com:80/fish',
'http://www.example.com:80/Test.html'
];
testRobots('http://www.example.com:8080/robots.txt', contents, allowed, disallowed);
});
it('should default to port 80 for http: if no port given', function () {
var contents = [
'User-agent: *',
'Disallow: /fish/',
'Disallow: /test.html'
].join('\n');
var allowed = [
'http://www.example.com:80/fish',
'http://www.example.com:80/Test.html'
];
var disallowed = [
'http://www.example.com:443/fish',
'http://www.example.com:443/Test.html',
'http://www.example.com:80/fish/index.php',
'http://www.example.com:80/fish/',
'http://www.example.com:80/test.html'
];
testRobots('http://www.example.com/robots.txt', contents, allowed, disallowed);
});
it('should default to port 443 for https: if no port given', function () {
var contents = [
'User-agent: *',
'Disallow: /fish/',
'Disallow: /test.html'
].join('\n');
var allowed = [
'https://www.example.com:443/fish',
'https://www.example.com:443/Test.html',
'https://www.example.com/fish',
'https://www.example.com/Test.html'
];
var disallowed = [
'http://www.example.com:80/fish',
'http://www.example.com:80/Test.html',
'http://www.example.com:443/fish/index.php',
'http://www.example.com:443/fish/',
'http://www.example.com:443/test.html'
];
testRobots('https://www.example.com/robots.txt', contents, allowed, disallowed);
});
it('should not be disallowed when wildcard is used in explicit mode', function () {
var contents = [
'User-agent: *',
'Disallow: /',
].join('\n');
var url = 'https://www.example.com/hello';
var userAgent = 'SomeBot';
var robots = robotsParser(url, contents);
expect(robots.isExplicitlyDisallowed(url, userAgent)).to.equal(false);
});
it('should be disallowed when the user agent matches a rule in explicit mode', function () {
var contents = [
'User-agent: SomeBot',
'Disallow: /',
].join('\n');
var url = 'https://www.example.com/hello';
var userAgent = 'SomeBot';
var robots = robotsParser(url, contents);
expect(robots.isExplicitlyDisallowed(url, userAgent)).to.equal(true);
});
it('should return undefined when given an invalid URL in explicit mode', function () {
var contents = [
'User-agent: SomeBot',
'Disallow: /',
].join('\n');
var url = 'https://www.example.com/hello';
var userAgent = 'SomeBot';
var robots = robotsParser('http://example.com', contents);
expect(robots.isExplicitlyDisallowed(url, userAgent)).to.equal(undefined);
});
});