From 362ca05ab88589eaad9116f1a8ed03b24f81cbee Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tobi=20Sch=C3=A4fer?=
Date: Sat, 10 May 2025 21:58:07 +0200
Subject: [PATCH] Adapt robots parser and its tests for Rhino

---
 code/Global/Robots.js | 1028 ++++++++++++-----------
 tests/robots.js       | 1797 +++++++++++++++++++++--------------------
 2 files changed, 1443 insertions(+), 1382 deletions(-)

diff --git a/code/Global/Robots.js b/code/Global/Robots.js
index b9835e0c..04c69d67 100644
--- a/code/Global/Robots.js
+++ b/code/Global/Robots.js
@@ -1,491 +1,543 @@
-/**
- * Trims the white space from the start and end of the line.
- *
- * If the line is an array it will strip the white space from
- * the start and end of each element of the array.
- *
- * @param {string|Array} line
- * @return {string|Array}
- * @private
- */
-function trimLine(line) {
-  if (!line) {
-    return null;
-  }
+// Robots parser adapted for Rhino-compatible JavaScript
+// Source:
+// Copyright (c) 2014 Sam Clarke
+// Copyright (c) 2025 Antville.org
+// MIT License (MIT)
-
-  if (Array.isArray(line)) {
-    return line.map(trimLine);
-  }
+
+// Transformation steps:
+// 1. Add IIFE around the code
+// 2. Replace module.exports with return statement
+// 3. Add conditional module.exports for CommonJS support
+// 4. Add URL class imitation
-
-  return String(line).trim();
+
+var Robots = (() => {
+  /**
+   * Half-baked (read-only) imitation of the URL class of Node.js
+   */
+  function nodeJsUrl(str, base) {
+    if (!str.includes('://')) {
+      str = (base || 'http://localhost') + str;
+    }
+
+    const url = new java.net.URL(str);
+    const port = url.port < 0 ? '' : url.port;
+    const userInfo = (url.getUserInfo() || "").split(':');
+
+    return {
+      hash: url.ref ? '#' + url.ref : '',
+      href: url.toString(),
+      host: url.host + (port ? ':' + port : port),
+      hostname: url.host,
+      password: userInfo[1] || "",
+      pathname: url.path,
+      origin: url.protocol + '://' + url.host + (port ? ':' + port : port),
+      port,
+      protocol: url.protocol,
+      search: url.query ? '?' + url.query : '',
+      searchParams: {
+        get: () => null,
+        set: () => null
+      },
+      username: userInfo[0] || "",
+    };
+  }
+
+  if (typeof URL === 'undefined') {
+    globalThis.URL = nodeJsUrl;
+  }
+
+  /**
+   * Trims the white space from the start and end of the line.
+   *
+   * If the line is an array it will strip the white space from
+   * the start and end of each element of the array.
+   *
+   * @param {string|Array} line
+   * @return {string|Array}
+   * @private
+   */
+  function trimLine(line) {
+    if (!line) {
+      return null;
+    }
+
+    if (Array.isArray(line)) {
+      return line.map(trimLine);
+    }
+
+    return String(line).trim();
+  }
+
+  /**
+   * Remove comments from lines
+   *
+   * @param {string} line
+   * @return {string}
+   * @private
+   */
+  function removeComments(line) {
+    var commentStartIndex = line.indexOf('#');
+    if (commentStartIndex > -1) {
+      return line.substr(0, commentStartIndex);
+    }
+
+    return line;
+  }
+
+  /**
+   * Splits a line at the first occurrence of :
+   *
+   * @param {string} line
+   * @return {Array.<string>}
+   * @private
+   */
+  function splitLine(line) {
+    var idx = String(line).indexOf(':');
+
+    if (!line || idx < 0) {
+      return null;
+    }
+
+    return [line.slice(0, idx), line.slice(idx + 1)];
+  }
+
+  /**
+   * Normalises the user-agent string by converting it to
+   * lower case and removing any version numbers.
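+   *
+   * Illustrative only (editor's note, not in the upstream file): both of
+   * the following hypothetical inputs normalise to the same token.
+   *
+   * @example
+   * formatUserAgent('SomeBot/1.0'); // => 'somebot'
+   * formatUserAgent('somebot');     // => 'somebot'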
+ * + * @param {string} userAgent + * @return {string} + * @private + */ + function formatUserAgent(userAgent) { + var formattedUserAgent = userAgent.toLowerCase(); + + // Strip the version number from robot/1.0 user agents + var idx = formattedUserAgent.indexOf('/'); + if (idx > -1) { + formattedUserAgent = formattedUserAgent.substr(0, idx); + } + + return formattedUserAgent.trim(); + } + + /** + * Normalises the URL encoding of a path by encoding + * unicode characters. + * + * @param {string} path + * @return {string} + * @private + */ + function normaliseEncoding(path) { + try { + return urlEncodeToUpper(encodeURI(path).replace(/%25/g, '%')); + } catch (e) { + return path; + } + } + + /** + * Convert URL encodings to support case. + * + * e.g.: %2a%ef becomes %2A%EF + * + * @param {string} path + * @return {string} + * @private + */ + function urlEncodeToUpper(path) { + return path.replace(/%[0-9a-fA-F]{2}/g, function (match) { + return match.toUpperCase(); + }); + } + + /** + * Matches a pattern with the specified path + * + * Uses same algorithm to match patterns as the Google implementation in + * google/robotstxt so it should be consistent with the spec. + * + * @see https://github.com/google/robotstxt/blob/f465f0ede81099dd8bc4aeb2966b3a892bd488b3/robots.cc#L74 + * @param {string} pattern + * @param {string} path + * @return {boolean} + * @private + */ + function matches(pattern, path) { + // I've added extra comments to try make this easier to understand + + // Stores the lengths of all the current matching substrings. + // Maximum number of possible matching lengths is every length in path plus + // 1 to handle 0 length too (if pattern starts with * which is zero or more) + var matchingLengths = new Array(path.length + 1); + var numMatchingLengths = 1; + + // Initially longest match is 0 + matchingLengths[0] = 0; + + for (var p = 0; p < pattern.length; p++) { + // If $ is at the end of pattern then we must match the whole path. + // Which is true if the longest matching length matches path length + if (pattern[p] === '$' && p + 1 === pattern.length) { + return matchingLengths[numMatchingLengths - 1] === path.length; + } + + // Handle wildcards + if (pattern[p] == '*') { + // Wildcard so all substrings minus the current smallest matching + // length are matches + numMatchingLengths = path.length - matchingLengths[0] + 1; + + // Update matching lengths to include the smallest all the way up + // to numMatchingLengths + // Don't update smallest possible match as * matches zero or more + // so the smallest current match is also valid + for (var i = 1; i < numMatchingLengths; i++) { + matchingLengths[i] = matchingLengths[i - 1] + 1; + } + } else { + // Check the char at the matching length matches the pattern, if it + // does increment it and add it as a valid length, ignore if not. 
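+        //
+        // Worked example (editor's note; pattern '/a*c' against path '/abc'):
+        // after matching '/a' the set of match lengths is {2}; '*' expands it
+        // to {2, 3, 4}; the literal 'c' then keeps only lengths where
+        // path[length] === 'c', leaving {4}, i.e. the whole path matched.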
+ var numMatches = 0; + for (var i = 0; i < numMatchingLengths; i++) { + if ( + matchingLengths[i] < path.length && + path[matchingLengths[i]] === pattern[p] + ) { + matchingLengths[numMatches++] = matchingLengths[i] + 1; + } + } + + // No paths matched the current pattern char so not a match + if (numMatches == 0) { + return false; + } + + numMatchingLengths = numMatches; + } + } + + return true; + } + + function parseRobots(contents, robots) { + var newlineRegex = /\r\n|\r|\n/; + var lines = contents + .split(newlineRegex) + .map(removeComments) + .map(splitLine) + .map(trimLine); + + var currentUserAgents = []; + var isNoneUserAgentState = true; + for (var i = 0; i < lines.length; i++) { + var line = lines[i]; + + if (!line || !line[0]) { + continue; + } + + switch (line[0].toLowerCase()) { + case 'user-agent': + if (isNoneUserAgentState) { + currentUserAgents.length = 0; + } + + if (line[1]) { + currentUserAgents.push(formatUserAgent(line[1])); + } + break; + case 'disallow': + robots.addRule(currentUserAgents, line[1], false, i + 1); + break; + case 'allow': + robots.addRule(currentUserAgents, line[1], true, i + 1); + break; + case 'crawl-delay': + robots.setCrawlDelay(currentUserAgents, line[1]); + break; + case 'sitemap': + if (line[1]) { + robots.addSitemap(line[1]); + } + break; + case 'host': + if (line[1]) { + robots.setPreferredHost(line[1].toLowerCase()); + } + break; + } + + isNoneUserAgentState = line[0].toLowerCase() !== 'user-agent'; + } + } + + /** + * Returns if a pattern is allowed by the specified rules. + * + * @param {string} path + * @param {Array.} rules + * @return {Object?} + * @private + */ + function findRule(path, rules) { + var matchedRule = null; + + for (var i = 0; i < rules.length; i++) { + var rule = rules[i]; + + if (!matches(rule.pattern, path)) { + continue; + } + + // The longest matching rule takes precedence + // If rules are the same length then allow takes precedence + if (!matchedRule || rule.pattern.length > matchedRule.pattern.length) { + matchedRule = rule; + } else if ( + rule.pattern.length == matchedRule.pattern.length && + rule.allow && + !matchedRule.allow + ) { + matchedRule = rule; + } + } + + return matchedRule; + } + + /** + * Converts provided string into an URL object. + * + * Will return null if provided string is not a valid URL. + * + * @param {string} url + * @return {?URL} + * @private + */ + function parseUrl(url) { + try { + // Specify a URL to be used with relative paths + // Using non-existent subdomain so can never cause conflict unless + // trying to crawl it but doesn't exist and even if tried worst that can + // happen is it allows relative URLs on it. + var url = new URL(url, 'http://robots-relative.samclarke.com/'); + + if (!url.port) { + url.port = url.protocol === 'https:' ? 443 : 80; + } + + return url; + } catch (e) { + return null; + } + } + + function Robots(url, contents) { + this._url = parseUrl(url) || {}; + this._rules = Object.create(null); + this._sitemaps = []; + this._preferredHost = null; + + parseRobots(contents || '', this); + } + + /** + * Adds the specified allow/deny rule to the rules + * for the specified user-agents. 
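+   *
+   * Editor's sketch of a typical call, as parseRobots() would issue it for a
+   * `Disallow: /private/` line inside a `User-agent: somebot` group (the
+   * agent name and pattern are hypothetical):
+   *
+   * @example
+   * robots.addRule(['somebot'], '/private/', false, 2);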
+ * + * @param {Array.} userAgents + * @param {string} pattern + * @param {boolean} allow + * @param {number} [lineNumber] Should use 1-based indexing + */ + Robots.prototype.addRule = function (userAgents, pattern, allow, lineNumber) { + var rules = this._rules; + + userAgents.forEach(function (userAgent) { + rules[userAgent] = rules[userAgent] || []; + + if (!pattern) { + return; + } + + rules[userAgent].push({ + pattern: normaliseEncoding(pattern), + allow: allow, + lineNumber: lineNumber + }); + }); + }; + + /** + * Adds the specified delay to the specified user agents. + * + * @param {Array.} userAgents + * @param {string} delayStr + */ + Robots.prototype.setCrawlDelay = function (userAgents, delayStr) { + var rules = this._rules; + var delay = Number(delayStr); + + userAgents.forEach(function (userAgent) { + rules[userAgent] = rules[userAgent] || []; + + if (isNaN(delay)) { + return; + } + + rules[userAgent].crawlDelay = delay; + }); + }; + + /** + * Add a sitemap + * + * @param {string} url + */ + Robots.prototype.addSitemap = function (url) { + this._sitemaps.push(url); + }; + + /** + * Sets the preferred host name + * + * @param {string} url + */ + Robots.prototype.setPreferredHost = function (url) { + this._preferredHost = url; + }; + + Robots.prototype._getRule = function (url, ua, explicit) { + var parsedUrl = parseUrl(url) || {}; + var userAgent = formatUserAgent(ua || '*'); + + // The base URL must match otherwise this robots.txt is not valid for it. + if ( + parsedUrl.protocol !== this._url.protocol || + parsedUrl.hostname !== this._url.hostname || + parsedUrl.port !== this._url.port + ) { + return; + } + + var rules = this._rules[userAgent]; + if (!explicit) { + rules = rules || this._rules['*']; + } + rules = rules || []; + var path = urlEncodeToUpper(parsedUrl.pathname + parsedUrl.search); + var rule = findRule(path, rules); + + return rule; + }; + + /** + * Returns true if allowed, false if not allowed. + * + * Will return undefined if the URL is not valid for + * this robots.txt file. + * + * @param {string} url + * @param {string?} ua + * @return {boolean?} + */ + Robots.prototype.isAllowed = function (url, ua) { + var rule = this._getRule(url, ua, false); + + if (typeof rule === 'undefined') { + return; + } + + return !rule || rule.allow; + }; + + /** + * Returns the line number of the matching directive for the specified + * URL and user-agent if any. + * + * The line numbers start at 1 and go up (1-based indexing). + * + * Return -1 if there is no matching directive. If a rule is manually + * added without a lineNumber then this will return undefined for that + * rule. + * + * @param {string} url + * @param {string?} ua + * @return {number?} + */ + Robots.prototype.getMatchingLineNumber = function (url, ua) { + var rule = this._getRule(url, ua, false); + + return rule ? rule.lineNumber : -1; + }; + + /** + * Returns the opposite of isAllowed() + * + * @param {string} url + * @param {string?} ua + * @return {boolean} + */ + Robots.prototype.isDisallowed = function (url, ua) { + return !this.isAllowed(url, ua); + }; + + /** + * Returns trues if explicitly disallowed + * for the specified user agent (User Agent wildcards are discarded). + * + * This will return undefined if the URL is not valid for this robots.txt file. 
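+   *
+   * Editor's sketch (hypothetical agent names), consistent with the explicit
+   * mode tests in tests/robots.js: given `User-agent: somebot` and
+   * `Disallow: /`,
+   *
+   * @example
+   * robots.isExplicitlyDisallowed(url, 'somebot');  // true
+   * robots.isExplicitlyDisallowed(url, 'otherbot'); // false (no '*' fallback)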
+ * + * @param {string} url + * @param {string} ua + * @return {boolean?} + */ + Robots.prototype.isExplicitlyDisallowed = function (url, ua) { + var rule = this._getRule(url, ua, true); + if (typeof rule === 'undefined') { + return; + } + + return !(!rule || rule.allow); + }; + + /** + * Gets the crawl delay if there is one. + * + * Will return undefined if there is no crawl delay set. + * + * @param {string} ua + * @return {number?} + */ + Robots.prototype.getCrawlDelay = function (ua) { + var userAgent = formatUserAgent(ua || '*'); + + return (this._rules[userAgent] || this._rules['*'] || {}).crawlDelay; + }; + + /** + * Returns the preferred host if there is one. + * + * @return {string?} + */ + Robots.prototype.getPreferredHost = function () { + return this._preferredHost; + }; + + /** + * Returns an array of sitemap URLs if there are any. + * + * @return {Array.} + */ + Robots.prototype.getSitemaps = function () { + return this._sitemaps.slice(0); + }; + + return Robots; +})(); + +if (typeof module !== 'undefined' && module.exports) { + module.exports = Robots; } - -/** - * Remove comments from lines - * - * @param {string} line - * @return {string} - * @private - */ -function removeComments(line) { - var commentStartIndex = line.indexOf('#'); - if (commentStartIndex > -1) { - return line.substr(0, commentStartIndex); - } - - return line; -} - -/** - * Splits a line at the first occurrence of : - * - * @param {string} line - * @return {Array.} - * @private - */ -function splitLine(line) { - var idx = String(line).indexOf(':'); - - if (!line || idx < 0) { - return null; - } - - return [line.slice(0, idx), line.slice(idx + 1)]; -} - -/** - * Normalises the user-agent string by converting it to - * lower case and removing any version numbers. - * - * @param {string} userAgent - * @return {string} - * @private - */ -function formatUserAgent(userAgent) { - var formattedUserAgent = userAgent.toLowerCase(); - - // Strip the version number from robot/1.0 user agents - var idx = formattedUserAgent.indexOf('/'); - if (idx > -1) { - formattedUserAgent = formattedUserAgent.substr(0, idx); - } - - return formattedUserAgent.trim(); -} - -/** - * Normalises the URL encoding of a path by encoding - * unicode characters. - * - * @param {string} path - * @return {string} - * @private - */ -function normaliseEncoding(path) { - try { - return urlEncodeToUpper(encodeURI(path).replace(/%25/g, '%')); - } catch (e) { - return path; - } -} - -/** - * Convert URL encodings to support case. - * - * e.g.: %2a%ef becomes %2A%EF - * - * @param {string} path - * @return {string} - * @private - */ -function urlEncodeToUpper(path) { - return path.replace(/%[0-9a-fA-F]{2}/g, function (match) { - return match.toUpperCase(); - }); -} - -/** - * Matches a pattern with the specified path - * - * Uses same algorithm to match patterns as the Google implementation in - * google/robotstxt so it should be consistent with the spec. - * - * @see https://github.com/google/robotstxt/blob/f465f0ede81099dd8bc4aeb2966b3a892bd488b3/robots.cc#L74 - * @param {string} pattern - * @param {string} path - * @return {boolean} - * @private - */ -function matches(pattern, path) { - // I've added extra comments to try make this easier to understand - - // Stores the lengths of all the current matching substrings. 
- // Maximum number of possible matching lengths is every length in path plus - // 1 to handle 0 length too (if pattern starts with * which is zero or more) - var matchingLengths = new Array(path.length + 1); - var numMatchingLengths = 1; - - // Initially longest match is 0 - matchingLengths[0] = 0; - - for (var p = 0; p < pattern.length; p++) { - // If $ is at the end of pattern then we must match the whole path. - // Which is true if the longest matching length matches path length - if (pattern[p] === '$' && p + 1 === pattern.length) { - return matchingLengths[numMatchingLengths - 1] === path.length; - } - - // Handle wildcards - if (pattern[p] == '*') { - // Wildcard so all substrings minus the current smallest matching - // length are matches - numMatchingLengths = path.length - matchingLengths[0] + 1; - - // Update matching lengths to include the smallest all the way up - // to numMatchingLengths - // Don't update smallest possible match as * matches zero or more - // so the smallest current match is also valid - for (var i = 1; i < numMatchingLengths; i++) { - matchingLengths[i] = matchingLengths[i - 1] + 1; - } - } else { - // Check the char at the matching length matches the pattern, if it - // does increment it and add it as a valid length, ignore if not. - var numMatches = 0; - for (var i = 0; i < numMatchingLengths; i++) { - if ( - matchingLengths[i] < path.length && - path[matchingLengths[i]] === pattern[p] - ) { - matchingLengths[numMatches++] = matchingLengths[i] + 1; - } - } - - // No paths matched the current pattern char so not a match - if (numMatches == 0) { - return false; - } - - numMatchingLengths = numMatches; - } - } - - return true; -} - -function parseRobots(contents, robots) { - var newlineRegex = /\r\n|\r|\n/; - var lines = contents - .split(newlineRegex) - .map(removeComments) - .map(splitLine) - .map(trimLine); - - var currentUserAgents = []; - var isNoneUserAgentState = true; - for (var i = 0; i < lines.length; i++) { - var line = lines[i]; - - if (!line || !line[0]) { - continue; - } - - switch (line[0].toLowerCase()) { - case 'user-agent': - if (isNoneUserAgentState) { - currentUserAgents.length = 0; - } - - if (line[1]) { - currentUserAgents.push(formatUserAgent(line[1])); - } - break; - case 'disallow': - robots.addRule(currentUserAgents, line[1], false, i + 1); - break; - case 'allow': - robots.addRule(currentUserAgents, line[1], true, i + 1); - break; - case 'crawl-delay': - robots.setCrawlDelay(currentUserAgents, line[1]); - break; - case 'sitemap': - if (line[1]) { - robots.addSitemap(line[1]); - } - break; - case 'host': - if (line[1]) { - robots.setPreferredHost(line[1].toLowerCase()); - } - break; - } - - isNoneUserAgentState = line[0].toLowerCase() !== 'user-agent'; - } -} - -/** - * Returns if a pattern is allowed by the specified rules. - * - * @param {string} path - * @param {Array.} rules - * @return {Object?} - * @private - */ -function findRule(path, rules) { - var matchedRule = null; - - for (var i = 0; i < rules.length; i++) { - var rule = rules[i]; - - if (!matches(rule.pattern, path)) { - continue; - } - - // The longest matching rule takes precedence - // If rules are the same length then allow takes precedence - if (!matchedRule || rule.pattern.length > matchedRule.pattern.length) { - matchedRule = rule; - } else if ( - rule.pattern.length == matchedRule.pattern.length && - rule.allow && - !matchedRule.allow - ) { - matchedRule = rule; - } - } - - return matchedRule; -} - -/** - * Converts provided string into an URL object. 
- * - * Will return null if provided string is not a valid URL. - * - * @param {string} url - * @return {?URL} - * @private - */ -function parseUrl(url) { - try { - // Specify a URL to be used with relative paths - // Using non-existent subdomain so can never cause conflict unless - // trying to crawl it but doesn't exist and even if tried worst that can - // happen is it allows relative URLs on it. - var url = new URL(url, 'http://robots-relative.samclarke.com/'); - - if (!url.port) { - url.port = url.protocol === 'https:' ? 443 : 80; - } - - return url; - } catch (e) { - return null; - } -} - -function Robots(url, contents) { - this._url = parseUrl(url) || {}; - this._rules = Object.create(null); - this._sitemaps = []; - this._preferredHost = null; - - parseRobots(contents || '', this); -} - -/** - * Adds the specified allow/deny rule to the rules - * for the specified user-agents. - * - * @param {Array.} userAgents - * @param {string} pattern - * @param {boolean} allow - * @param {number} [lineNumber] Should use 1-based indexing - */ -Robots.prototype.addRule = function (userAgents, pattern, allow, lineNumber) { - var rules = this._rules; - - userAgents.forEach(function (userAgent) { - rules[userAgent] = rules[userAgent] || []; - - if (!pattern) { - return; - } - - rules[userAgent].push({ - pattern: normaliseEncoding(pattern), - allow: allow, - lineNumber: lineNumber - }); - }); -}; - -/** - * Adds the specified delay to the specified user agents. - * - * @param {Array.} userAgents - * @param {string} delayStr - */ -Robots.prototype.setCrawlDelay = function (userAgents, delayStr) { - var rules = this._rules; - var delay = Number(delayStr); - - userAgents.forEach(function (userAgent) { - rules[userAgent] = rules[userAgent] || []; - - if (isNaN(delay)) { - return; - } - - rules[userAgent].crawlDelay = delay; - }); -}; - -/** - * Add a sitemap - * - * @param {string} url - */ -Robots.prototype.addSitemap = function (url) { - this._sitemaps.push(url); -}; - -/** - * Sets the preferred host name - * - * @param {string} url - */ -Robots.prototype.setPreferredHost = function (url) { - this._preferredHost = url; -}; - -Robots.prototype._getRule = function (url, ua, explicit) { - var parsedUrl = parseUrl(url) || {}; - var userAgent = formatUserAgent(ua || '*'); - - // The base URL must match otherwise this robots.txt is not valid for it. - if ( - parsedUrl.protocol !== this._url.protocol || - parsedUrl.hostname !== this._url.hostname || - parsedUrl.port !== this._url.port - ) { - return; - } - - var rules = this._rules[userAgent]; - if (!explicit) { - rules = rules || this._rules['*']; - } - rules = rules || []; - - var path = urlEncodeToUpper(parsedUrl.pathname + parsedUrl.search); - var rule = findRule(path, rules); - - return rule; -}; - -/** - * Returns true if allowed, false if not allowed. - * - * Will return undefined if the URL is not valid for - * this robots.txt file. - * - * @param {string} url - * @param {string?} ua - * @return {boolean?} - */ -Robots.prototype.isAllowed = function (url, ua) { - var rule = this._getRule(url, ua, false); - - if (typeof rule === 'undefined') { - return; - } - - return !rule || rule.allow; -}; - -/** - * Returns the line number of the matching directive for the specified - * URL and user-agent if any. - * - * The line numbers start at 1 and go up (1-based indexing). - * - * Return -1 if there is no matching directive. If a rule is manually - * added without a lineNumber then this will return undefined for that - * rule. 
- * - * @param {string} url - * @param {string?} ua - * @return {number?} - */ -Robots.prototype.getMatchingLineNumber = function (url, ua) { - var rule = this._getRule(url, ua, false); - - return rule ? rule.lineNumber : -1; -}; - -/** - * Returns the opposite of isAllowed() - * - * @param {string} url - * @param {string?} ua - * @return {boolean} - */ -Robots.prototype.isDisallowed = function (url, ua) { - return !this.isAllowed(url, ua); -}; - -/** - * Returns trues if explicitly disallowed - * for the specified user agent (User Agent wildcards are discarded). - * - * This will return undefined if the URL is not valid for this robots.txt file. - * - * @param {string} url - * @param {string} ua - * @return {boolean?} - */ -Robots.prototype.isExplicitlyDisallowed = function (url, ua) { - var rule = this._getRule(url, ua, true); - if (typeof rule === 'undefined') { - return; - } - - return !(!rule || rule.allow); -}; - -/** - * Gets the crawl delay if there is one. - * - * Will return undefined if there is no crawl delay set. - * - * @param {string} ua - * @return {number?} - */ -Robots.prototype.getCrawlDelay = function (ua) { - var userAgent = formatUserAgent(ua || '*'); - - return (this._rules[userAgent] || this._rules['*'] || {}).crawlDelay; -}; - -/** - * Returns the preferred host if there is one. - * - * @return {string?} - */ -Robots.prototype.getPreferredHost = function () { - return this._preferredHost; -}; - -/** - * Returns an array of sitemap URLs if there are any. - * - * @return {Array.} - */ -Robots.prototype.getSitemaps = function () { - return this._sitemaps.slice(0); -}; - -module.exports = Robots; \ No newline at end of file diff --git a/tests/robots.js b/tests/robots.js index dd501c38..7a138549 100644 --- a/tests/robots.js +++ b/tests/robots.js @@ -1,903 +1,912 @@ -var robotsParser = require('../index'); -var expect = require('chai').expect; +// Unit tests of the robots parser +// Source: +// Copyright (c) 2014 Sam Clarke +// MIT License (MIT) +// Run with `npx nyc --reporter=text-summary --reporter=html --reporter=lcovonly mocha tests/robots.js` + +// Set up the test environment with Antville’s version of the robots parser +const Robots = require('../code/Global/Robots.js'); +const robotsParser = (url, contents) => new Robots(url, contents); + +const { expect } = require('chai'); function testRobots(url, contents, allowed, disallowed) { - var robots = robotsParser(url, contents); + var robots = robotsParser(url, contents); - allowed.forEach(function (url) { - expect(robots.isAllowed(url)).to.equal(true); - }); + allowed.forEach(function (url) { + expect(robots.isAllowed(url)).to.equal(true); + }); - disallowed.forEach(function (url) { - expect(robots.isDisallowed(url)).to.equal(true); - }); + disallowed.forEach(function (url) { + expect(robots.isDisallowed(url)).to.equal(true); + }); } describe('Robots', function () { - it('should parse the disallow directive', function () { - var contents = [ - 'User-agent: *', - 'Disallow: /fish/', - 'Disallow: /test.html' - ].join('\n'); - - var allowed = [ - 'http://www.example.com/fish', - 'http://www.example.com/Test.html' - ]; - - var disallowed = [ - 'http://www.example.com/fish/index.php', - 'http://www.example.com/fish/', - 'http://www.example.com/test.html' - ]; - - testRobots('http://www.example.com/robots.txt', contents, allowed, disallowed); - }); - - it('should parse the allow directive', function () { - var contents = [ - 'User-agent: *', - 'Disallow: /fish/', - 'Disallow: /test.html', - 'Allow: /fish/test.html', - 
'Allow: /test.html' - ].join('\n'); - - var allowed = [ - 'http://www.example.com/fish', - 'http://www.example.com/fish/test.html', - 'http://www.example.com/Test.html', - 'http://www.example.com/test.html' - ]; - - var disallowed = [ - 'http://www.example.com/fish/index.php', - 'http://www.example.com/fish/', - ]; - - testRobots('http://www.example.com/robots.txt', contents, allowed, disallowed); - }); - - it('should parse patterns', function () { - var contents = [ - 'User-agent: *', - 'Disallow: /fish*.php', - 'Disallow: /*.dext$', - 'Disallow: /dir*' - ].join('\n'); - - var allowed = [ - 'http://www.example.com/Fish.PHP', - 'http://www.example.com/Fish.dext1', - 'http://www.example.com/folder/dir.html', - 'http://www.example.com/folder/dir/test.html' - ]; - - var disallowed = [ - 'http://www.example.com/fish.php', - 'http://www.example.com/fishheads/catfish.php?parameters', - 'http://www.example.com/AnYthInG.dext', - 'http://www.example.com/Fish.dext.dext', - 'http://www.example.com/dir/test.html', - 'http://www.example.com/directory.html' - ]; - - testRobots('http://www.example.com/robots.txt', contents, allowed, disallowed); - }); - - it('should have the correct order precedence for allow and disallow', function () { - var contents = [ - 'User-agent: *', - 'Disallow: /fish*.php', - 'Allow: /fish/index.php', - 'Disallow: /test', - 'Allow: /test/', - 'Disallow: /aa/', - 'Allow: /aa/', - 'Allow: /bb/', - 'Disallow: /bb/', - ].join('\n'); - - var allowed = [ - 'http://www.example.com/test/index.html', - 'http://www.example.com/fish/index.php', - 'http://www.example.com/test/', - 'http://www.example.com/aa/', - 'http://www.example.com/bb/', - 'http://www.example.com/x/' - ]; - - var disallowed = [ - 'http://www.example.com/fish.php', - 'http://www.example.com/fishheads/catfish.php?parameters', - 'http://www.example.com/test' - ]; - - testRobots('http://www.example.com/robots.txt', contents, allowed, disallowed); - }); - - it('should have the correct order precedence for wildcards', function () { - var contents = [ - 'User-agent: *', - 'Disallow: /*/', - 'Allow: /x/', - ].join('\n'); - - var allowed = [ - 'http://www.example.com/x/', - 'http://www.example.com/fish.php', - 'http://www.example.com/test' - ]; - - var disallowed = [ - 'http://www.example.com/a/', - 'http://www.example.com/xx/', - 'http://www.example.com/test/index.html' - ]; - - testRobots('http://www.example.com/robots.txt', contents, allowed, disallowed); - }); - - it('should parse lines delimitated by \\r', function () { - var contents = [ - 'User-agent: *', - 'Disallow: /fish/', - 'Disallow: /test.html' - ].join('\r'); - - var allowed = [ - 'http://www.example.com/fish', - 'http://www.example.com/Test.html' - ]; - - var disallowed = [ - 'http://www.example.com/fish/index.php', - 'http://www.example.com/fish/', - 'http://www.example.com/test.html' - ]; - - testRobots('http://www.example.com/robots.txt', contents, allowed, disallowed); - }); - - it('should parse lines delimitated by \\r\\n', function () { - var contents = [ - 'User-agent: *', - 'Disallow: /fish/', - 'Disallow: /test.html' - ].join('\r\n'); - - var allowed = [ - 'http://www.example.com/fish', - 'http://www.example.com/Test.html' - ]; - - var disallowed = [ - 'http://www.example.com/fish/index.php', - 'http://www.example.com/fish/', - 'http://www.example.com/test.html' - ]; - - testRobots('http://www.example.com/robots.txt', contents, allowed, disallowed); - }); - - - it('should parse lines delimitated by mixed line endings', function () { - var contents = [ - 
'User-agent: *\r', - 'Disallow: /fish/\r\n', - 'Disallow: /test.html\n\n' - ].join(''); - - var allowed = [ - 'http://www.example.com/fish', - 'http://www.example.com/Test.html' - ]; - - var disallowed = [ - 'http://www.example.com/fish/index.php', - 'http://www.example.com/fish/', - 'http://www.example.com/test.html' - ]; - - testRobots('http://www.example.com/robots.txt', contents, allowed, disallowed); - }); - - it('should ignore rules that are not in a group', function () { - var contents = [ - 'Disallow: /secret.html', - 'Disallow: /test', - ].join('\n'); - - var allowed = [ - 'http://www.example.com/secret.html', - 'http://www.example.com/test/index.html', - 'http://www.example.com/test/' - ]; - - testRobots('http://www.example.com/robots.txt', contents, allowed, []); - }); - - - it('should ignore comments', function () { - var contents = [ - '#', - '# This is a comment', - '#', - 'User-agent: *', - '# This is a comment', - 'Disallow: /fish/ # ignore', - '# Disallow: fish', - 'Disallow: /test.html' - ].join('\n'); - - var allowed = [ - 'http://www.example.com/fish', - 'http://www.example.com/Test.html' - ]; - - var disallowed = [ - 'http://www.example.com/fish/index.php', - 'http://www.example.com/fish/', - 'http://www.example.com/test.html' - ]; - - testRobots('http://www.example.com/robots.txt', contents, allowed, disallowed); - }); - - it('should ignore invalid lines', function () { - var contents = [ - 'invalid line', - 'User-agent: *', - 'Disallow: /fish/', - ':::::another invalid line:::::', - 'Disallow: /test.html', - 'Unknown: tule' - ].join('\n'); - - var allowed = [ - 'http://www.example.com/fish', - 'http://www.example.com/Test.html' - ]; - - var disallowed = [ - 'http://www.example.com/fish/index.php', - 'http://www.example.com/fish/', - 'http://www.example.com/test.html' - ]; - - testRobots('http://www.example.com/robots.txt', contents, allowed, disallowed); - }); - - it('should ignore empty user-agent lines', function () { - var contents = [ - 'User-agent:', - 'Disallow: /fish/', - 'Disallow: /test.html' - ].join('\n'); - - var allowed = [ - 'http://www.example.com/fish', - 'http://www.example.com/Test.html', - 'http://www.example.com/fish/index.php', - 'http://www.example.com/fish/', - 'http://www.example.com/test.html' - ]; - - var disallowed = []; - - testRobots('http://www.example.com/robots.txt', contents, allowed, disallowed); - }); - - it('should support groups with multiple user agents (case insensitive)', function () { - var contents = [ - 'User-agent: agenta', - 'User-agent: agentb', - 'Disallow: /fish', - ].join('\n'); - - var robots = robotsParser('http://www.example.com/robots.txt', contents); - - expect(robots.isAllowed("http://www.example.com/fish", "agenta")).to.equal(false); - }); - - it('should return undefined for invalid urls', function () { - var contents = [ - 'User-agent: *', - 'Disallow: /secret.html', - 'Disallow: /test', - ].join('\n'); - - var invalidUrls = [ - 'http://example.com/secret.html', - 'http://ex ample.com/secret.html', - 'http://www.example.net/test/index.html', - 'http://www.examsple.com/test/', - 'example.com/test/', - ':::::;;`\\|/.example.com/test/' - ]; - - var robots = robotsParser('http://www.example.com/robots.txt', contents); - - invalidUrls.forEach(function (url) { - expect(robots.isAllowed(url)).to.equal(undefined); - }); - }); - - it('should handle Unicode, urlencoded and punycode URLs', function () { - var contents = [ - 'User-agent: *', - 'Disallow: /secret.html', - 'Disallow: /test', - ].join('\n'); - - var allowed = 
[ - 'http://www.münich.com/index.html', - 'http://www.xn--mnich-kva.com/index.html', - 'http://www.m%C3%BCnich.com/index.html' - ]; - - var disallowed = [ - 'http://www.münich.com/secret.html', - 'http://www.xn--mnich-kva.com/secret.html', - 'http://www.m%C3%BCnich.com/secret.html' - ]; - - testRobots('http://www.münich.com/robots.txt', contents, allowed, disallowed); - testRobots('http://www.xn--mnich-kva.com/robots.txt', contents, allowed, disallowed); - testRobots('http://www.m%C3%BCnich.com/robots.txt', contents, allowed, disallowed); - }); - - it('should handle Unicode and urlencoded paths', function () { - var contents = [ - 'User-agent: *', - 'Disallow: /%CF%80', - 'Disallow: /%e2%9d%83', - 'Disallow: /%a%a', - 'Disallow: /💩', - 'Disallow: /✼*t$', - 'Disallow: /%E2%9C%A4*t$', - 'Disallow: /✿%a', - 'Disallow: /http%3A%2F%2Fexample.org' - ].join('\n'); - - var allowed = [ - 'http://www.example.com/✼testing', - 'http://www.example.com/%E2%9C%BCtesting', - 'http://www.example.com/✤testing', - 'http://www.example.com/%E2%9C%A4testing', - 'http://www.example.com/http://example.org', - 'http://www.example.com/http:%2F%2Fexample.org' - ]; - - var disallowed = [ - 'http://www.example.com/%CF%80', - 'http://www.example.com/%CF%80/index.html', - 'http://www.example.com/π', - 'http://www.example.com/π/index.html', - 'http://www.example.com/%e2%9d%83', - 'http://www.example.com/%E2%9D%83/index.html', - 'http://www.example.com/❃', - 'http://www.example.com/❃/index.html', - 'http://www.example.com/%F0%9F%92%A9', - 'http://www.example.com/%F0%9F%92%A9/index.html', - 'http://www.example.com/💩', - 'http://www.example.com/💩/index.html', - 'http://www.example.com/%a%a', - 'http://www.example.com/%a%a/index.html', - 'http://www.example.com/✼test', - 'http://www.example.com/%E2%9C%BCtest', - 'http://www.example.com/✤test', - 'http://www.example.com/%E2%9C%A4testt', - 'http://www.example.com/✿%a', - 'http://www.example.com/%E2%9C%BF%atest', - 'http://www.example.com/http%3A%2F%2Fexample.org' - ]; - - testRobots('http://www.example.com/robots.txt', contents, allowed, disallowed); - }); - - it('should handle lone high / low surrogates', function () { - var contents = [ - 'User-agent: *', - 'Disallow: /\uD800', - 'Disallow: /\uDC00' - ].join('\n'); - - // These are invalid so can't be disallowed - var allowed = [ - 'http://www.example.com/\uDC00', - 'http://www.example.com/\uD800' - ]; - - var disallowed = []; - - testRobots('http://www.example.com/robots.txt', contents, allowed, disallowed); - }); - - it('should ignore host case', function () { - var contents = [ - 'User-agent: *', - 'Disallow: /secret.html', - 'Disallow: /test', - ].join('\n'); - - var allowed = [ - 'http://www.example.com/index.html', - 'http://www.ExAmPlE.com/index.html', - 'http://www.EXAMPLE.com/index.html' - ]; - - var disallowed = [ - 'http://www.example.com/secret.html', - 'http://www.ExAmPlE.com/secret.html', - 'http://www.EXAMPLE.com/secret.html' - ]; - - testRobots('http://www.eXample.com/robots.txt', contents, allowed, disallowed); - }); - - it('should handle relative paths', function () { - var contents = [ - 'User-agent: *', - 'Disallow: /fish', - 'Allow: /fish/test', - ].join('\n'); - - var robots = robotsParser('/robots.txt', contents); - expect(robots.isAllowed('/fish/test')).to.equal(true); - expect(robots.isAllowed('/fish')).to.equal(false); - }); - - it('should not allow relative paths if domain specified', function () { - var contents = [ - 'User-agent: *', - 'Disallow: /fish', - 'Allow: /fish/test', - ].join('\n'); - - 
var robots = robotsParser('http://www.example.com/robots.txt', contents); - expect(robots.isAllowed('/fish/test')).to.equal(undefined); - expect(robots.isAllowed('/fish')).to.equal(undefined); - }); - - it('should not treat invalid robots.txt URLs as relative', function () { - var contents = [ - 'User-agent: *', - 'Disallow: /fish', - 'Allow: /fish/test', - ].join('\n'); - - var robots = robotsParser('https://ex ample.com/robots.txt', contents); - expect(robots.isAllowed('/fish/test')).to.equal(undefined); - expect(robots.isAllowed('/fish')).to.equal(undefined); - }); - - it('should not allow URls if domain specified and robots.txt is relative', function () { - var contents = [ - 'User-agent: *', - 'Disallow: /fish', - 'Allow: /fish/test', - ].join('\n'); - - var robots = robotsParser('/robots.txt', contents); - expect(robots.isAllowed('http://www.example.com/fish/test')).to.equal(undefined); - expect(robots.isAllowed('http://www.example.com/fish')).to.equal(undefined); - }); - - it('should allow all if empty robots.txt', function () { - var allowed = [ - 'http://www.example.com/secret.html', - 'http://www.example.com/test/index.html', - 'http://www.example.com/test/' - ]; - - var robots = robotsParser('http://www.example.com/robots.txt', ''); - - allowed.forEach(function (url) { - expect(robots.isAllowed(url)).to.equal(true); - }); - }); - - it('should treat null as allowing all', function () { - var robots = robotsParser('http://www.example.com/robots.txt', null); - - expect(robots.isAllowed("http://www.example.com/", "userAgent")).to.equal(true); - expect(robots.isAllowed("http://www.example.com/")).to.equal(true); - }); - - it('should handle invalid robots.txt urls', function () { - var contents = [ - 'user-agent: *', - 'disallow: /', - - 'host: www.example.com', - 'sitemap: /sitemap.xml' - ].join('\n'); - - var sitemapUrls = [ - undefined, - null, - 'null', - ':/wom/test/' - ]; - - sitemapUrls.forEach(function (url) { - var robots = robotsParser(url, contents); - expect(robots.isAllowed('http://www.example.com/index.html')).to.equal(undefined); - expect(robots.getPreferredHost()).to.equal('www.example.com'); - expect(robots.getSitemaps()).to.eql(['/sitemap.xml']); - }); - }); - - it('should parse the crawl-delay directive', function () { - var contents = [ - 'user-agent: a', - 'crawl-delay: 1', - - 'user-agent: b', - 'disallow: /d', - - 'user-agent: c', - 'user-agent: d', - 'crawl-delay: 10' - ].join('\n'); - - var robots = robotsParser('http://www.example.com/robots.txt', contents); - - expect(robots.getCrawlDelay('a')).to.equal(1); - expect(robots.getCrawlDelay('b')).to.equal(undefined); - expect(robots.getCrawlDelay('c')).to.equal(10); - expect(robots.getCrawlDelay('d')).to.equal(10); - expect(robots.getCrawlDelay()).to.equal(undefined); - }); - - it('should ignore invalid crawl-delay directives', function () { - var contents = [ - 'user-agent: a', - 'crawl-delay: 1.2.1', - - 'user-agent: b', - 'crawl-delay: 1.a0', - - 'user-agent: c', - 'user-agent: d', - 'crawl-delay: 10a' - ].join('\n'); - - var robots = robotsParser('http://www.example.com/robots.txt', contents); - - expect(robots.getCrawlDelay('a')).to.equal(undefined); - expect(robots.getCrawlDelay('b')).to.equal(undefined); - expect(robots.getCrawlDelay('c')).to.equal(undefined); - expect(robots.getCrawlDelay('d')).to.equal(undefined); - }); - - it('should parse the sitemap directive', function () { - var contents = [ - 'user-agent: a', - 'crawl-delay: 1', - 'sitemap: http://example.com/test.xml', - - 'user-agent: b', - 
'disallow: /d', - - 'sitemap: /sitemap.xml', - 'sitemap: http://example.com/test/sitemap.xml ' - ].join('\n'); - - var robots = robotsParser('http://www.example.com/robots.txt', contents); - - expect(robots.getSitemaps()).to.eql([ - 'http://example.com/test.xml', - '/sitemap.xml', - 'http://example.com/test/sitemap.xml' - ]); - }); - - it('should parse the host directive', function () { - var contents = [ - 'user-agent: a', - 'crawl-delay: 1', - 'host: www.example.net', - - 'user-agent: b', - 'disallow: /d', - - 'host: example.com' - ].join('\n'); - - var robots = robotsParser('http://www.example.com/robots.txt', contents); - - expect(robots.getPreferredHost()).to.equal('example.com'); - }); - - it('should parse empty and invalid directives', function () { - var contents = [ - 'user-agent:', - 'user-agent:::: a::', - 'crawl-delay:', - 'crawl-delay:::: 0:', - 'host:', - 'host:: example.com', - 'sitemap:', - 'sitemap:: site:map.xml', - 'disallow:', - 'disallow::: /:', - 'allow:', - 'allow::: /:', - ].join('\n'); - - robotsParser('http://www.example.com/robots.txt', contents); - }); - - it('should treat only the last host directive as valid', function () { - var contents = [ - 'user-agent: a', - 'crawl-delay: 1', - 'host: www.example.net', - - 'user-agent: b', - 'disallow: /d', - - 'host: example.net', - 'host: example.com' - ].join('\n'); - - var robots = robotsParser('http://www.example.com/robots.txt', contents); - - expect(robots.getPreferredHost()).to.equal('example.com'); - }); - - it('should return null when there is no host directive', function () { - var contents = [ - 'user-agent: a', - 'crawl-delay: 1', - - 'user-agent: b', - 'disallow: /d', - ].join('\n'); - - var robots = robotsParser('http://www.example.com/robots.txt', contents); - - expect(robots.getPreferredHost()).to.equal(null); - }); - - it('should fallback to * when a UA has no rules of its own', function () { - var contents = [ - 'user-agent: *', - 'crawl-delay: 1', - - 'user-agent: b', - 'crawl-delay: 12', - - 'user-agent: c', - 'user-agent: d', - 'crawl-delay: 10' - ].join('\n'); - - var robots = robotsParser('http://www.example.com/robots.txt', contents); - - expect(robots.getCrawlDelay('should-fall-back')).to.equal(1); - expect(robots.getCrawlDelay('d')).to.equal(10); - expect(robots.getCrawlDelay('dd')).to.equal(1); - }); - - it('should not fallback to * when a UA has rules', function () { - var contents = [ - 'user-agent: *', - 'crawl-delay: 1', - - 'user-agent: b', - 'disallow:' - ].join('\n'); - - var robots = robotsParser('http://www.example.com/robots.txt', contents); - - expect(robots.getCrawlDelay('b')).to.equal(undefined); - }); - - it('should handle UAs with object property names', function () { - var contents = [ - 'User-agent: *', - 'Disallow: /fish', - ].join('\n'); - - var robots = robotsParser('http://www.example.com/robots.txt', contents); - expect(robots.isAllowed('http://www.example.com/fish', 'constructor')).to.equal(false); - expect(robots.isAllowed('http://www.example.com/fish', '__proto__')).to.equal(false); - }); - - it('should ignore version numbers in the UA string', function () { - var contents = [ - 'user-agent: *', - 'crawl-delay: 1', - - 'user-agent: b', - 'crawl-delay: 12', - - 'user-agent: c', - 'user-agent: d', - 'crawl-delay: 10' - ].join('\n'); - - var robots = robotsParser('http://www.example.com/robots.txt', contents); - - expect(robots.getCrawlDelay('should-fall-back/1.0.0')).to.equal(1); - expect(robots.getCrawlDelay('d/12')).to.equal(10); - expect(robots.getCrawlDelay('dd / 
0-32-3')).to.equal(1); - expect(robots.getCrawlDelay('b / 1.0')).to.equal(12); - }); - - - it('should return the line number of the matching directive', function () { - var contents = [ - '', - 'User-agent: *', - '', - 'Disallow: /fish/', - 'Disallow: /test.html', - 'Allow: /fish/test.html', - 'Allow: /test.html', - '', - 'User-agent: a', - 'allow: /', - '', - 'User-agent: b', - 'disallow: /test', - 'disallow: /t*t', - '', - 'User-agent: c', - 'Disallow: /fish*.php', - 'Allow: /fish/index.php' - ].join('\n'); - - var robots = robotsParser('http://www.example.com/robots.txt', contents); - - expect(robots.getMatchingLineNumber('http://www.example.com/fish')).to.equal(-1); - expect(robots.getMatchingLineNumber('http://www.example.com/fish/test.html')).to.equal(6); - expect(robots.getMatchingLineNumber('http://www.example.com/Test.html')).to.equal(-1); - - expect(robots.getMatchingLineNumber('http://www.example.com/fish/index.php')).to.equal(4); - expect(robots.getMatchingLineNumber('http://www.example.com/fish/')).to.equal(4); - expect(robots.getMatchingLineNumber('http://www.example.com/test.html')).to.equal(7); - - expect(robots.getMatchingLineNumber('http://www.example.com/test.html', 'a')).to.equal(10); - - expect(robots.getMatchingLineNumber('http://www.example.com/fish.php', 'c')).to.equal(17); - expect(robots.getMatchingLineNumber('http://www.example.com/fish/index.php', 'c')).to.equal(18); - }); - - it('should handle large wildcards efficiently', function () { - var contents = [ - 'User-agent: *', - 'Disallow: /' + '*'.repeat(2048) + '.html', - ].join('\n'); - - var allowed = [ - 'http://www.example.com/' + 'sub'.repeat(2048) + 'folder/index.php', - ]; - - var disallowed = [ - 'http://www.example.com/secret.html' - ]; - - const start = Date.now(); - testRobots('http://www.eXample.com/robots.txt', contents, allowed, disallowed); - const end = Date.now(); - - // Should take less than 500 ms (high to allow for variableness of - // machines running the test, should normally be much less) - expect(end - start).to.be.lessThan(500); - }); - - it('should honor given port number', function () { - var contents = [ - 'User-agent: *', - 'Disallow: /fish/', - 'Disallow: /test.html' - ].join('\n'); - - var allowed = [ - 'http://www.example.com:8080/fish', - 'http://www.example.com:8080/Test.html' - ]; - - var disallowed = [ - 'http://www.example.com/fish', - 'http://www.example.com/Test.html', - 'http://www.example.com:80/fish', - 'http://www.example.com:80/Test.html' - ]; - - testRobots('http://www.example.com:8080/robots.txt', contents, allowed, disallowed); - }); - - it('should default to port 80 for http: if no port given', function () { - var contents = [ - 'User-agent: *', - 'Disallow: /fish/', - 'Disallow: /test.html' - ].join('\n'); - - var allowed = [ - 'http://www.example.com:80/fish', - 'http://www.example.com:80/Test.html' - ]; - - var disallowed = [ - 'http://www.example.com:443/fish', - 'http://www.example.com:443/Test.html', - 'http://www.example.com:80/fish/index.php', - 'http://www.example.com:80/fish/', - 'http://www.example.com:80/test.html' - ]; - - testRobots('http://www.example.com/robots.txt', contents, allowed, disallowed); - }); - - it('should default to port 443 for https: if no port given', function () { - var contents = [ - 'User-agent: *', - 'Disallow: /fish/', - 'Disallow: /test.html' - ].join('\n'); - - var allowed = [ - 'https://www.example.com:443/fish', - 'https://www.example.com:443/Test.html', - 'https://www.example.com/fish', - 
'https://www.example.com/Test.html' - ]; - - var disallowed = [ - 'http://www.example.com:80/fish', - 'http://www.example.com:80/Test.html', - 'http://www.example.com:443/fish/index.php', - 'http://www.example.com:443/fish/', - 'http://www.example.com:443/test.html' - ]; - - testRobots('https://www.example.com/robots.txt', contents, allowed, disallowed); - }); - - it('should not be disallowed when wildcard is used in explicit mode', function () { - var contents = [ - 'User-agent: *', - 'Disallow: /', - ].join('\n') - - var url = 'https://www.example.com/hello' - var userAgent = 'SomeBot'; - var robots = robotsParser(url, contents); - - expect(robots.isExplicitlyDisallowed(url, userAgent)).to.equal(false) - }); - - it('should be disallowed when user agent equal robots rule in explicit mode', function () { - var contents = [ - 'User-agent: SomeBot', - 'Disallow: /', - ].join('\n') - - var url = 'https://www.example.com/hello' - var userAgent = 'SomeBot'; - var robots = robotsParser(url, contents); - - expect(robots.isExplicitlyDisallowed(url, userAgent)).to.equal(true) - }); - - it('should return undefined when given an invalid URL in explicit mode', function () { - var contents = [ - 'User-agent: SomeBot', - 'Disallow: /', - ].join('\n') - - var url = 'https://www.example.com/hello' - var userAgent = 'SomeBot'; - var robots = robotsParser('http://example.com', contents); - - expect(robots.isExplicitlyDisallowed(url, userAgent)).to.equal(undefined) - }); + it('should parse the disallow directive', function () { + var contents = [ + 'User-agent: *', + 'Disallow: /fish/', + 'Disallow: /test.html' + ].join('\n'); + + var allowed = [ + 'http://www.example.com/fish', + 'http://www.example.com/Test.html' + ]; + + var disallowed = [ + 'http://www.example.com/fish/index.php', + 'http://www.example.com/fish/', + 'http://www.example.com/test.html' + ]; + + testRobots('http://www.example.com/robots.txt', contents, allowed, disallowed); + }); + + it('should parse the allow directive', function () { + var contents = [ + 'User-agent: *', + 'Disallow: /fish/', + 'Disallow: /test.html', + 'Allow: /fish/test.html', + 'Allow: /test.html' + ].join('\n'); + + var allowed = [ + 'http://www.example.com/fish', + 'http://www.example.com/fish/test.html', + 'http://www.example.com/Test.html', + 'http://www.example.com/test.html' + ]; + + var disallowed = [ + 'http://www.example.com/fish/index.php', + 'http://www.example.com/fish/', + ]; + + testRobots('http://www.example.com/robots.txt', contents, allowed, disallowed); + }); + + it('should parse patterns', function () { + var contents = [ + 'User-agent: *', + 'Disallow: /fish*.php', + 'Disallow: /*.dext$', + 'Disallow: /dir*' + ].join('\n'); + + var allowed = [ + 'http://www.example.com/Fish.PHP', + 'http://www.example.com/Fish.dext1', + 'http://www.example.com/folder/dir.html', + 'http://www.example.com/folder/dir/test.html' + ]; + + var disallowed = [ + 'http://www.example.com/fish.php', + 'http://www.example.com/fishheads/catfish.php?parameters', + 'http://www.example.com/AnYthInG.dext', + 'http://www.example.com/Fish.dext.dext', + 'http://www.example.com/dir/test.html', + 'http://www.example.com/directory.html' + ]; + + testRobots('http://www.example.com/robots.txt', contents, allowed, disallowed); + }); + + it('should have the correct order precedence for allow and disallow', function () { + var contents = [ + 'User-agent: *', + 'Disallow: /fish*.php', + 'Allow: /fish/index.php', + 'Disallow: /test', + 'Allow: /test/', + 'Disallow: /aa/', + 'Allow: /aa/', + 
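+      // note: /aa/ and /bb/ list Disallow and Allow in opposite orders; both
+      // end up allowed because equal-length rules resolve in favour of Allow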
'Allow: /bb/', + 'Disallow: /bb/', + ].join('\n'); + + var allowed = [ + 'http://www.example.com/test/index.html', + 'http://www.example.com/fish/index.php', + 'http://www.example.com/test/', + 'http://www.example.com/aa/', + 'http://www.example.com/bb/', + 'http://www.example.com/x/' + ]; + + var disallowed = [ + 'http://www.example.com/fish.php', + 'http://www.example.com/fishheads/catfish.php?parameters', + 'http://www.example.com/test' + ]; + + testRobots('http://www.example.com/robots.txt', contents, allowed, disallowed); + }); + + it('should have the correct order precedence for wildcards', function () { + var contents = [ + 'User-agent: *', + 'Disallow: /*/', + 'Allow: /x/', + ].join('\n'); + + var allowed = [ + 'http://www.example.com/x/', + 'http://www.example.com/fish.php', + 'http://www.example.com/test' + ]; + + var disallowed = [ + 'http://www.example.com/a/', + 'http://www.example.com/xx/', + 'http://www.example.com/test/index.html' + ]; + + testRobots('http://www.example.com/robots.txt', contents, allowed, disallowed); + }); + + it('should parse lines delimitated by \\r', function () { + var contents = [ + 'User-agent: *', + 'Disallow: /fish/', + 'Disallow: /test.html' + ].join('\r'); + + var allowed = [ + 'http://www.example.com/fish', + 'http://www.example.com/Test.html' + ]; + + var disallowed = [ + 'http://www.example.com/fish/index.php', + 'http://www.example.com/fish/', + 'http://www.example.com/test.html' + ]; + + testRobots('http://www.example.com/robots.txt', contents, allowed, disallowed); + }); + + it('should parse lines delimitated by \\r\\n', function () { + var contents = [ + 'User-agent: *', + 'Disallow: /fish/', + 'Disallow: /test.html' + ].join('\r\n'); + + var allowed = [ + 'http://www.example.com/fish', + 'http://www.example.com/Test.html' + ]; + + var disallowed = [ + 'http://www.example.com/fish/index.php', + 'http://www.example.com/fish/', + 'http://www.example.com/test.html' + ]; + + testRobots('http://www.example.com/robots.txt', contents, allowed, disallowed); + }); + + + it('should parse lines delimitated by mixed line endings', function () { + var contents = [ + 'User-agent: *\r', + 'Disallow: /fish/\r\n', + 'Disallow: /test.html\n\n' + ].join(''); + + var allowed = [ + 'http://www.example.com/fish', + 'http://www.example.com/Test.html' + ]; + + var disallowed = [ + 'http://www.example.com/fish/index.php', + 'http://www.example.com/fish/', + 'http://www.example.com/test.html' + ]; + + testRobots('http://www.example.com/robots.txt', contents, allowed, disallowed); + }); + + it('should ignore rules that are not in a group', function () { + var contents = [ + 'Disallow: /secret.html', + 'Disallow: /test', + ].join('\n'); + + var allowed = [ + 'http://www.example.com/secret.html', + 'http://www.example.com/test/index.html', + 'http://www.example.com/test/' + ]; + + testRobots('http://www.example.com/robots.txt', contents, allowed, []); + }); + + + it('should ignore comments', function () { + var contents = [ + '#', + '# This is a comment', + '#', + 'User-agent: *', + '# This is a comment', + 'Disallow: /fish/ # ignore', + '# Disallow: fish', + 'Disallow: /test.html' + ].join('\n'); + + var allowed = [ + 'http://www.example.com/fish', + 'http://www.example.com/Test.html' + ]; + + var disallowed = [ + 'http://www.example.com/fish/index.php', + 'http://www.example.com/fish/', + 'http://www.example.com/test.html' + ]; + + testRobots('http://www.example.com/robots.txt', contents, allowed, disallowed); + }); + + it('should ignore invalid lines', function () 
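+  // Editor's note: splitLine() returns null for lines without a ':' and the
+  // parser's switch ignores unknown directives, so such lines are dropped.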
{ + var contents = [ + 'invalid line', + 'User-agent: *', + 'Disallow: /fish/', + ':::::another invalid line:::::', + 'Disallow: /test.html', + 'Unknown: tule' + ].join('\n'); + + var allowed = [ + 'http://www.example.com/fish', + 'http://www.example.com/Test.html' + ]; + + var disallowed = [ + 'http://www.example.com/fish/index.php', + 'http://www.example.com/fish/', + 'http://www.example.com/test.html' + ]; + + testRobots('http://www.example.com/robots.txt', contents, allowed, disallowed); + }); + + it('should ignore empty user-agent lines', function () { + var contents = [ + 'User-agent:', + 'Disallow: /fish/', + 'Disallow: /test.html' + ].join('\n'); + + var allowed = [ + 'http://www.example.com/fish', + 'http://www.example.com/Test.html', + 'http://www.example.com/fish/index.php', + 'http://www.example.com/fish/', + 'http://www.example.com/test.html' + ]; + + var disallowed = []; + + testRobots('http://www.example.com/robots.txt', contents, allowed, disallowed); + }); + + it('should support groups with multiple user agents (case insensitive)', function () { + var contents = [ + 'User-agent: agenta', + 'User-agent: agentb', + 'Disallow: /fish', + ].join('\n'); + + var robots = robotsParser('http://www.example.com/robots.txt', contents); + + expect(robots.isAllowed("http://www.example.com/fish", "agenta")).to.equal(false); + }); + + it('should return undefined for invalid urls', function () { + var contents = [ + 'User-agent: *', + 'Disallow: /secret.html', + 'Disallow: /test', + ].join('\n'); + + var invalidUrls = [ + 'http://example.com/secret.html', + 'http://ex ample.com/secret.html', + 'http://www.example.net/test/index.html', + 'http://www.examsple.com/test/', + 'example.com/test/', + ':::::;;`\\|/.example.com/test/' + ]; + + var robots = robotsParser('http://www.example.com/robots.txt', contents); + + invalidUrls.forEach(function (url) { + expect(robots.isAllowed(url)).to.equal(undefined); + }); + }); + + it('should handle Unicode, urlencoded and punycode URLs', function () { + var contents = [ + 'User-agent: *', + 'Disallow: /secret.html', + 'Disallow: /test', + ].join('\n'); + + var allowed = [ + 'http://www.münich.com/index.html', + 'http://www.xn--mnich-kva.com/index.html', + 'http://www.m%C3%BCnich.com/index.html' + ]; + + var disallowed = [ + 'http://www.münich.com/secret.html', + 'http://www.xn--mnich-kva.com/secret.html', + 'http://www.m%C3%BCnich.com/secret.html' + ]; + + testRobots('http://www.münich.com/robots.txt', contents, allowed, disallowed); + testRobots('http://www.xn--mnich-kva.com/robots.txt', contents, allowed, disallowed); + testRobots('http://www.m%C3%BCnich.com/robots.txt', contents, allowed, disallowed); + }); + + it('should handle Unicode and urlencoded paths', function () { + var contents = [ + 'User-agent: *', + 'Disallow: /%CF%80', + 'Disallow: /%e2%9d%83', + 'Disallow: /%a%a', + 'Disallow: /💩', + 'Disallow: /✼*t$', + 'Disallow: /%E2%9C%A4*t$', + 'Disallow: /✿%a', + 'Disallow: /http%3A%2F%2Fexample.org' + ].join('\n'); + + var allowed = [ + 'http://www.example.com/✼testing', + 'http://www.example.com/%E2%9C%BCtesting', + 'http://www.example.com/✤testing', + 'http://www.example.com/%E2%9C%A4testing', + 'http://www.example.com/http://example.org', + 'http://www.example.com/http:%2F%2Fexample.org' + ]; + + var disallowed = [ + 'http://www.example.com/%CF%80', + 'http://www.example.com/%CF%80/index.html', + 'http://www.example.com/π', + 'http://www.example.com/π/index.html', + 'http://www.example.com/%e2%9d%83', + 
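+      // editor's note: urlEncodeToUpper() uppercases %-escapes in both rule
+      // patterns and paths, so %e2%9d%83 and %E2%9D%83 hit the same rule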
+      'http://www.example.com/%E2%9D%83/index.html',
+      'http://www.example.com/❃',
+      'http://www.example.com/❃/index.html',
+      'http://www.example.com/%F0%9F%92%A9',
+      'http://www.example.com/%F0%9F%92%A9/index.html',
+      'http://www.example.com/💩',
+      'http://www.example.com/💩/index.html',
+      'http://www.example.com/%a%a',
+      'http://www.example.com/%a%a/index.html',
+      'http://www.example.com/✼test',
+      'http://www.example.com/%E2%9C%BCtest',
+      'http://www.example.com/✤test',
+      'http://www.example.com/%E2%9C%A4testt',
+      'http://www.example.com/✿%a',
+      'http://www.example.com/%E2%9C%BF%atest',
+      'http://www.example.com/http%3A%2F%2Fexample.org'
+    ];
+
+    testRobots('http://www.example.com/robots.txt', contents, allowed, disallowed);
+  });
+
+  it('should handle lone high / low surrogates', function () {
+    var contents = [
+      'User-agent: *',
+      'Disallow: /\uD800',
+      'Disallow: /\uDC00'
+    ].join('\n');
+
+    // These are invalid so can't be disallowed
+    var allowed = [
+      'http://www.example.com/\uDC00',
+      'http://www.example.com/\uD800'
+    ];
+
+    var disallowed = [];
+
+    testRobots('http://www.example.com/robots.txt', contents, allowed, disallowed);
+  });
+
+  it('should ignore host case', function () {
+    var contents = [
+      'User-agent: *',
+      'Disallow: /secret.html',
+      'Disallow: /test',
+    ].join('\n');
+
+    var allowed = [
+      'http://www.example.com/index.html',
+      'http://www.ExAmPlE.com/index.html',
+      'http://www.EXAMPLE.com/index.html'
+    ];
+
+    var disallowed = [
+      'http://www.example.com/secret.html',
+      'http://www.ExAmPlE.com/secret.html',
+      'http://www.EXAMPLE.com/secret.html'
+    ];
+
+    testRobots('http://www.eXample.com/robots.txt', contents, allowed, disallowed);
+  });
+
+  it('should handle relative paths', function () {
+    var contents = [
+      'User-agent: *',
+      'Disallow: /fish',
+      'Allow: /fish/test',
+    ].join('\n');
+
+    var robots = robotsParser('/robots.txt', contents);
+    expect(robots.isAllowed('/fish/test')).to.equal(true);
+    expect(robots.isAllowed('/fish')).to.equal(false);
+  });
+
+  it('should not allow relative paths if domain specified', function () {
+    var contents = [
+      'User-agent: *',
+      'Disallow: /fish',
+      'Allow: /fish/test',
+    ].join('\n');
+
+    var robots = robotsParser('http://www.example.com/robots.txt', contents);
+    expect(robots.isAllowed('/fish/test')).to.equal(undefined);
+    expect(robots.isAllowed('/fish')).to.equal(undefined);
+  });
+
+  it('should not treat invalid robots.txt URLs as relative', function () {
+    var contents = [
+      'User-agent: *',
+      'Disallow: /fish',
+      'Allow: /fish/test',
+    ].join('\n');
+
+    var robots = robotsParser('https://ex ample.com/robots.txt', contents);
+    expect(robots.isAllowed('/fish/test')).to.equal(undefined);
+    expect(robots.isAllowed('/fish')).to.equal(undefined);
+  });
+
+  it('should not allow URLs if domain specified and robots.txt is relative', function () {
+    var contents = [
+      'User-agent: *',
+      'Disallow: /fish',
+      'Allow: /fish/test',
+    ].join('\n');
+
+    var robots = robotsParser('/robots.txt', contents);
+    expect(robots.isAllowed('http://www.example.com/fish/test')).to.equal(undefined);
+    expect(robots.isAllowed('http://www.example.com/fish')).to.equal(undefined);
+  });
+
+  it('should allow all if empty robots.txt', function () {
+    var allowed = [
+      'http://www.example.com/secret.html',
+      'http://www.example.com/test/index.html',
+      'http://www.example.com/test/'
+    ];
+
+    var robots = robotsParser('http://www.example.com/robots.txt', '');
+
+    allowed.forEach(function (url) {
+      expect(robots.isAllowed(url)).to.equal(true);
+    });
+  });
+
+  it('should treat null as allowing all', function () {
+    var robots = robotsParser('http://www.example.com/robots.txt', null);
+
+    expect(robots.isAllowed("http://www.example.com/", "userAgent")).to.equal(true);
+    expect(robots.isAllowed("http://www.example.com/")).to.equal(true);
+  });
+
+  it('should handle invalid robots.txt URLs', function () {
+    var contents = [
+      'user-agent: *',
+      'disallow: /',
+
+      'host: www.example.com',
+      'sitemap: /sitemap.xml'
+    ].join('\n');
+
+    var sitemapUrls = [
+      undefined,
+      null,
+      'null',
+      ':/wom/test/'
+    ];
+
+    sitemapUrls.forEach(function (url) {
+      var robots = robotsParser(url, contents);
+      expect(robots.isAllowed('http://www.example.com/index.html')).to.equal(undefined);
+      expect(robots.getPreferredHost()).to.equal('www.example.com');
+      expect(robots.getSitemaps()).to.eql(['/sitemap.xml']);
+    });
+  });
+
+  it('should parse the crawl-delay directive', function () {
+    var contents = [
+      'user-agent: a',
+      'crawl-delay: 1',
+
+      'user-agent: b',
+      'disallow: /d',
+
+      'user-agent: c',
+      'user-agent: d',
+      'crawl-delay: 10'
+    ].join('\n');
+
+    var robots = robotsParser('http://www.example.com/robots.txt', contents);
+
+    expect(robots.getCrawlDelay('a')).to.equal(1);
+    expect(robots.getCrawlDelay('b')).to.equal(undefined);
+    expect(robots.getCrawlDelay('c')).to.equal(10);
+    expect(robots.getCrawlDelay('d')).to.equal(10);
+    expect(robots.getCrawlDelay()).to.equal(undefined);
+  });
+
+  it('should ignore invalid crawl-delay directives', function () {
+    var contents = [
+      'user-agent: a',
+      'crawl-delay: 1.2.1',
+
+      'user-agent: b',
+      'crawl-delay: 1.a0',
+
+      'user-agent: c',
+      'user-agent: d',
+      'crawl-delay: 10a'
+    ].join('\n');
+
+    var robots = robotsParser('http://www.example.com/robots.txt', contents);
+
+    expect(robots.getCrawlDelay('a')).to.equal(undefined);
+    expect(robots.getCrawlDelay('b')).to.equal(undefined);
+    expect(robots.getCrawlDelay('c')).to.equal(undefined);
+    expect(robots.getCrawlDelay('d')).to.equal(undefined);
+  });
+
+  it('should parse the sitemap directive', function () {
+    var contents = [
+      'user-agent: a',
+      'crawl-delay: 1',
+      'sitemap: http://example.com/test.xml',
+
+      'user-agent: b',
+      'disallow: /d',
+
+      'sitemap: /sitemap.xml',
+      'sitemap: http://example.com/test/sitemap.xml '
+    ].join('\n');
+
+    var robots = robotsParser('http://www.example.com/robots.txt', contents);
+
+    expect(robots.getSitemaps()).to.eql([
+      'http://example.com/test.xml',
+      '/sitemap.xml',
+      'http://example.com/test/sitemap.xml'
+    ]);
+  });
+
+  it('should parse the host directive', function () {
+    var contents = [
+      'user-agent: a',
+      'crawl-delay: 1',
+      'host: www.example.net',
+
+      'user-agent: b',
+      'disallow: /d',
+
+      'host: example.com'
+    ].join('\n');
+
+    var robots = robotsParser('http://www.example.com/robots.txt', contents);
+
+    expect(robots.getPreferredHost()).to.equal('example.com');
+  });
+
+  it('should parse empty and invalid directives', function () {
+    var contents = [
+      'user-agent:',
+      'user-agent:::: a::',
+      'crawl-delay:',
+      'crawl-delay:::: 0:',
+      'host:',
+      'host:: example.com',
+      'sitemap:',
+      'sitemap:: site:map.xml',
+      'disallow:',
+      'disallow::: /:',
+      'allow:',
+      'allow::: /:',
+    ].join('\n');
+
+    // Parsing should not throw on any of these directives.
+    robotsParser('http://www.example.com/robots.txt', contents);
+  });
+
+  it('should treat only the last host directive as valid', function () {
+    var contents = [
+      'user-agent: a',
+      'crawl-delay: 1',
+      'host: www.example.net',
+
+      'user-agent: b',
+      'disallow: /d',
+
+      'host: example.net',
+      'host: example.com'
+    ].join('\n');
+
+    var robots = robotsParser('http://www.example.com/robots.txt', contents);
+
+    expect(robots.getPreferredHost()).to.equal('example.com');
+  });
+
+  it('should return null when there is no host directive', function () {
+    var contents = [
+      'user-agent: a',
+      'crawl-delay: 1',
+
+      'user-agent: b',
+      'disallow: /d',
+    ].join('\n');
+
+    var robots = robotsParser('http://www.example.com/robots.txt', contents);
+
+    expect(robots.getPreferredHost()).to.equal(null);
+  });
+
+  it('should fall back to * when a UA has no rules of its own', function () {
+    var contents = [
+      'user-agent: *',
+      'crawl-delay: 1',
+
+      'user-agent: b',
+      'crawl-delay: 12',
+
+      'user-agent: c',
+      'user-agent: d',
+      'crawl-delay: 10'
+    ].join('\n');
+
+    var robots = robotsParser('http://www.example.com/robots.txt', contents);
+
+    expect(robots.getCrawlDelay('should-fall-back')).to.equal(1);
+    expect(robots.getCrawlDelay('d')).to.equal(10);
+    expect(robots.getCrawlDelay('dd')).to.equal(1);
+  });
+
+  it('should not fall back to * when a UA has rules', function () {
+    var contents = [
+      'user-agent: *',
+      'crawl-delay: 1',
+
+      'user-agent: b',
+      'disallow:'
+    ].join('\n');
+
+    var robots = robotsParser('http://www.example.com/robots.txt', contents);
+
+    expect(robots.getCrawlDelay('b')).to.equal(undefined);
+  });
+
+  it('should handle UAs with object property names', function () {
+    var contents = [
+      'User-agent: *',
+      'Disallow: /fish',
+    ].join('\n');
+
+    var robots = robotsParser('http://www.example.com/robots.txt', contents);
+    expect(robots.isAllowed('http://www.example.com/fish', 'constructor')).to.equal(false);
+    expect(robots.isAllowed('http://www.example.com/fish', '__proto__')).to.equal(false);
+  });
+
+  it('should ignore version numbers in the UA string', function () {
+    var contents = [
+      'user-agent: *',
+      'crawl-delay: 1',
+
+      'user-agent: b',
+      'crawl-delay: 12',
+
+      'user-agent: c',
+      'user-agent: d',
+      'crawl-delay: 10'
+    ].join('\n');
+
+    var robots = robotsParser('http://www.example.com/robots.txt', contents);
+
+    expect(robots.getCrawlDelay('should-fall-back/1.0.0')).to.equal(1);
+    expect(robots.getCrawlDelay('d/12')).to.equal(10);
+    expect(robots.getCrawlDelay('dd / 0-32-3')).to.equal(1);
+    expect(robots.getCrawlDelay('b / 1.0')).to.equal(12);
+  });
+
+  it('should return the line number of the matching directive', function () {
+    var contents = [
+      '',
+      'User-agent: *',
+      '',
+      'Disallow: /fish/',
+      'Disallow: /test.html',
+      'Allow: /fish/test.html',
+      'Allow: /test.html',
+      '',
+      'User-agent: a',
+      'allow: /',
+      '',
+      'User-agent: b',
+      'disallow: /test',
+      'disallow: /t*t',
+      '',
+      'User-agent: c',
+      'Disallow: /fish*.php',
+      'Allow: /fish/index.php'
+    ].join('\n');
+
+    var robots = robotsParser('http://www.example.com/robots.txt', contents);
+
+    expect(robots.getMatchingLineNumber('http://www.example.com/fish')).to.equal(-1);
+    expect(robots.getMatchingLineNumber('http://www.example.com/fish/test.html')).to.equal(6);
+    expect(robots.getMatchingLineNumber('http://www.example.com/Test.html')).to.equal(-1);
+
+    expect(robots.getMatchingLineNumber('http://www.example.com/fish/index.php')).to.equal(4);
+    expect(robots.getMatchingLineNumber('http://www.example.com/fish/')).to.equal(4);
+    expect(robots.getMatchingLineNumber('http://www.example.com/test.html')).to.equal(7);
+
+    expect(robots.getMatchingLineNumber('http://www.example.com/test.html', 'a')).to.equal(10);
+
+    expect(robots.getMatchingLineNumber('http://www.example.com/fish.php', 'c')).to.equal(17);
+    expect(robots.getMatchingLineNumber('http://www.example.com/fish/index.php', 'c')).to.equal(18);
+  });
+
+  it('should handle large wildcards efficiently', function () {
+    var contents = [
+      'User-agent: *',
+      'Disallow: /' + '*'.repeat(2048) + '.html',
+    ].join('\n');
+
+    var allowed = [
+      'http://www.example.com/' + 'sub'.repeat(2048) + 'folder/index.php',
+    ];
+
+    var disallowed = [
+      'http://www.example.com/secret.html'
+    ];
+
+    const start = Date.now();
+    testRobots('http://www.eXample.com/robots.txt', contents, allowed, disallowed);
+    const end = Date.now();
+
+    // Should take less than 500 ms (high to allow for the variability of
+    // machines running the test, should normally be much less)
+    expect(end - start).to.be.lessThan(500);
+  });
+
+  it('should honor given port number', function () {
+    var contents = [
+      'User-agent: *',
+      'Disallow: /fish/',
+      'Disallow: /test.html'
+    ].join('\n');
+
+    var allowed = [
+      'http://www.example.com:8080/fish',
+      'http://www.example.com:8080/Test.html'
+    ];
+
+    var disallowed = [
+      'http://www.example.com/fish',
+      'http://www.example.com/Test.html',
+      'http://www.example.com:80/fish',
+      'http://www.example.com:80/Test.html'
+    ];
+
+    testRobots('http://www.example.com:8080/robots.txt', contents, allowed, disallowed);
+  });
+
+  it('should default to port 80 for http: if no port given', function () {
+    var contents = [
+      'User-agent: *',
+      'Disallow: /fish/',
+      'Disallow: /test.html'
+    ].join('\n');
+
+    var allowed = [
+      'http://www.example.com:80/fish',
+      'http://www.example.com:80/Test.html'
+    ];
+
+    var disallowed = [
+      'http://www.example.com:443/fish',
+      'http://www.example.com:443/Test.html',
+      'http://www.example.com:80/fish/index.php',
+      'http://www.example.com:80/fish/',
+      'http://www.example.com:80/test.html'
+    ];
+
+    testRobots('http://www.example.com/robots.txt', contents, allowed, disallowed);
+  });
+
+  it('should default to port 443 for https: if no port given', function () {
+    var contents = [
+      'User-agent: *',
+      'Disallow: /fish/',
+      'Disallow: /test.html'
+    ].join('\n');
+
+    var allowed = [
+      'https://www.example.com:443/fish',
+      'https://www.example.com:443/Test.html',
+      'https://www.example.com/fish',
+      'https://www.example.com/Test.html'
+    ];
+
+    var disallowed = [
+      'http://www.example.com:80/fish',
+      'http://www.example.com:80/Test.html',
+      'http://www.example.com:443/fish/index.php',
+      'http://www.example.com:443/fish/',
+      'http://www.example.com:443/test.html'
+    ];
+
+    testRobots('https://www.example.com/robots.txt', contents, allowed, disallowed);
+  });
+
+  it('should not be disallowed when wildcard is used in explicit mode', function () {
+    var contents = [
+      'User-agent: *',
+      'Disallow: /',
+    ].join('\n');
+
+    var url = 'https://www.example.com/hello';
+    var userAgent = 'SomeBot';
+    var robots = robotsParser(url, contents);
+
+    expect(robots.isExplicitlyDisallowed(url, userAgent)).to.equal(false);
+  });
+
+  it('should be disallowed when the user agent equals the robots rule in explicit mode', function () {
+    var contents = [
+      'User-agent: SomeBot',
+      'Disallow: /',
+    ].join('\n');
+
+    var url = 'https://www.example.com/hello';
+    var userAgent = 'SomeBot';
+    var robots = robotsParser(url, contents);
+
+    expect(robots.isExplicitlyDisallowed(url, userAgent)).to.equal(true);
+  });
+
+  it('should return undefined when given an invalid URL in explicit mode', function () {
+    var contents = [
+      'User-agent: SomeBot',
+      'Disallow: /',
+    ].join('\n');
+
+    var url = 'https://www.example.com/hello';
+    var userAgent = 'SomeBot';
+    var robots = robotsParser('http://example.com', contents);
+
+    expect(robots.isExplicitlyDisallowed(url, userAgent)).to.equal(undefined);
+  });
+});