Merge branch 'main' into automatically-set-root-cookie

# Conflicts:
#	.github/workflows/deploy.yml
This commit is contained in:
Tobi Schäfer 2025-05-27 20:32:47 +02:00
commit a7fc1df893
Signed by: tobi
GPG key ID: 91FAE6FE2EBAC4C8
10 changed files with 1905 additions and 312 deletions

View file

@ -24,7 +24,6 @@ jobs:
run: ./gradlew :build
- name: Copy files to server
# The rsync command applies the same filters as the one in tools/extras/deploy.sh
run: |
rsync ./build/install/antville/ ${{ inputs.hostname }}:./apps/antville/ \
--archive --compress --delete --verbose \

543
code/Global/Robots.js Normal file
View file

@ -0,0 +1,543 @@
// Robots parser adapted for Rhino-compatible JavaScript
// Source: <https://github.com/samclarke/robots-parser>
// Copyright (c) 2014 Sam Clarke
// Copyright (c) 2025 Antville.org
// MIT License (MIT)
// Transformation steps:
// 1. Add IIFE around the code
// 2. Replace module.exports with return statement
// 3. Add conditional module.exports for CommonJS support
// 4. Add URL class imitation
var Robots = (() => {
/**
 * Half-baked (read-only) imitation of the URL class of Node.js,
 * backed by java.net.URL, for Rhino environments without a global URL.
 *
 * @param {string} str - Absolute URL, or a path resolved against `base`
 * @param {string} [base] - Base URL used when `str` has no scheme
 * @returns {Object} Plain object mimicking the WHATWG URL interface
 */
function nodeJsUrl(str, base) {
  if (!str.includes('://')) {
    str = (base || 'http://localhost') + str;
  }
  const url = new java.net.URL(str);
  // java.net.URL reports -1 when the URL carries no explicit port
  const port = url.port < 0 ? '' : url.port;
  const userInfo = (url.getUserInfo() || "").split(':');
  return {
    hash: url.ref ? '#' + url.ref : '',
    href: url.toString(),
    host: url.host + (port ? ':' + port : port),
    hostname: url.host,
    password: userInfo[1] || "",
    pathname: url.path,
    origin: url.protocol + '://' + url.host + (port ? ':' + port : port),
    // NOTE(review): port is numeric here while Node.js returns a string —
    // consistent as long as both compared URLs come from this shim; confirm
    port,
    // Node.js includes the trailing colon (e.g. "https:"); parseUrl() below
    // compares against 'https:' and relies on this
    protocol: url.protocol + ':',
    // Fixed: was `url.queryy` (typo), so `search` was always empty
    search: url.query ? '?' + url.query : '',
    searchParams: {
      get: () => null,
      set: () => null
    },
    username: userInfo[0] || "",
  };
}
// Install the shim only where no native URL implementation exists (Rhino)
if (typeof URL === 'undefined') {
  globalThis.URL = nodeJsUrl;
}
/**
 * Strips leading and trailing whitespace from a line; when given an
 * array, does so recursively for every element. Falsy input yields null.
 *
 * @param {string|Array} line
 * @return {string|Array}
 * @private
 */
function trimLine(line) {
  if (!line) {
    return null;
  }
  return Array.isArray(line)
    ? line.map(trimLine)
    : String(line).trim();
}
/**
 * Strips a trailing comment (everything from the first '#') from a line.
 *
 * @param {string} line
 * @return {string}
 * @private
 */
function removeComments(line) {
  var hashAt = line.indexOf('#');
  return hashAt > -1 ? line.slice(0, hashAt) : line;
}
/**
 * Splits a line at the first occurrence of ':' into [key, value].
 * Returns null for falsy lines or lines without a separator.
 *
 * @param {string} line
 * @return {Array.<string>}
 * @private
 */
function splitLine(line) {
  if (line) {
    var text = String(line);
    var at = text.indexOf(':');
    if (at !== -1) {
      return [text.slice(0, at), text.slice(at + 1)];
    }
  }
  return null;
}
/**
 * Normalises a user-agent string: lower case, version suffix (anything
 * from the first '/') removed, surrounding whitespace trimmed.
 *
 * @param {string} userAgent
 * @return {string}
 * @private
 */
function formatUserAgent(userAgent) {
  var normalised = userAgent.toLowerCase();
  // Strip the version number from robot/1.0 style user agents
  var slashAt = normalised.indexOf('/');
  if (slashAt !== -1) {
    normalised = normalised.slice(0, slashAt);
  }
  return normalised.trim();
}
/**
 * Normalises the URL encoding of a path by percent-encoding unicode
 * characters and upper-casing escape sequences. The path is returned
 * unchanged if it cannot be encoded (e.g. lone surrogates).
 *
 * @param {string} path
 * @return {string}
 * @private
 */
function normaliseEncoding(path) {
  try {
    // encodeURI escapes '%' to '%25'; undo that so percent-escapes
    // already present in the path survive a single encoding pass
    var encoded = encodeURI(path).replace(/%25/g, '%');
    return urlEncodeToUpper(encoded);
  } catch (e) {
    return path;
  }
}
/**
 * Upper-cases every percent-escape in a path.
 *
 * e.g.: %2a%ef becomes %2A%EF
 *
 * @param {string} path
 * @return {string}
 * @private
 */
function urlEncodeToUpper(path) {
  return path.replace(/%[0-9a-fA-F]{2}/g, (escape) => escape.toUpperCase());
}
/**
 * Matches a pattern with the specified path.
 *
 * Uses the same algorithm to match patterns as the Google implementation
 * in google/robotstxt so it should be consistent with the spec. Supports
 * '*' as "zero or more characters" and a trailing '$' as an end-of-path
 * anchor; any other character matches literally.
 *
 * @see https://github.com/google/robotstxt/blob/f465f0ede81099dd8bc4aeb2966b3a892bd488b3/robots.cc#L74
 * @param {string} pattern
 * @param {string} path
 * @return {boolean}
 * @private
 */
function matches(pattern, path) {
  // Stores the lengths of all the currently matching substrings.
  // Maximum number of possible matching lengths is every length in path
  // plus 1 to handle zero length too (if pattern starts with *, which
  // matches zero or more characters).
  var matchingLengths = new Array(path.length + 1);
  var numMatchingLengths = 1;
  // Initially the longest match is 0
  matchingLengths[0] = 0;
  for (var p = 0; p < pattern.length; p++) {
    // If $ is at the end of the pattern then we must match the whole path,
    // which is true if the longest matching length equals the path length.
    if (pattern[p] === '$' && p + 1 === pattern.length) {
      return matchingLengths[numMatchingLengths - 1] === path.length;
    }
    // Handle wildcards
    if (pattern[p] === '*') {
      // Wildcard, so all substrings minus the current smallest matching
      // length are matches.
      numMatchingLengths = path.length - matchingLengths[0] + 1;
      // Update matching lengths to include the smallest all the way up
      // to numMatchingLengths. Don't update the smallest possible match,
      // as * matches zero or more, so the smallest current match stays valid.
      for (var i = 1; i < numMatchingLengths; i++) {
        matchingLengths[i] = matchingLengths[i - 1] + 1;
      }
    } else {
      // Check the char at each matching length against the pattern char;
      // if it matches, increment it and keep it as a valid length.
      var numMatches = 0;
      for (var i = 0; i < numMatchingLengths; i++) {
        if (
          matchingLengths[i] < path.length &&
          path[matchingLengths[i]] === pattern[p]
        ) {
          matchingLengths[numMatches++] = matchingLengths[i] + 1;
        }
      }
      // No candidate matched the current pattern char, so not a match
      // (strict equality instead of the original loose ==)
      if (numMatches === 0) {
        return false;
      }
      numMatchingLengths = numMatches;
    }
  }
  return true;
}
/**
 * Parses robots.txt contents and feeds every directive into the given
 * Robots instance via addRule/setCrawlDelay/addSitemap/setPreferredHost.
 *
 * @param {string} contents - Raw robots.txt text
 * @param {Robots} robots - Instance collecting the parsed directives
 * @private
 */
function parseRobots(contents, robots) {
  var lines = contents
    .split(/\r\n|\r|\n/)
    .map(removeComments)
    .map(splitLine)
    .map(trimLine);
  var currentUserAgents = [];
  var isNoneUserAgentState = true;
  lines.forEach(function (line, index) {
    if (!line || !line[0]) {
      return;
    }
    var directive = line[0].toLowerCase();
    var value = line[1];
    switch (directive) {
      case 'user-agent':
        // A user-agent line after a non-user-agent line starts a new group
        if (isNoneUserAgentState) {
          currentUserAgents.length = 0;
        }
        if (value) {
          currentUserAgents.push(formatUserAgent(value));
        }
        break;
      case 'disallow':
        robots.addRule(currentUserAgents, value, false, index + 1);
        break;
      case 'allow':
        robots.addRule(currentUserAgents, value, true, index + 1);
        break;
      case 'crawl-delay':
        robots.setCrawlDelay(currentUserAgents, value);
        break;
      case 'sitemap':
        if (value) {
          robots.addSitemap(value);
        }
        break;
      case 'host':
        if (value) {
          robots.setPreferredHost(value.toLowerCase());
        }
        break;
    }
    isNoneUserAgentState = directive !== 'user-agent';
  });
}
/**
 * Finds the rule that applies to the given path, if any.
 *
 * The longest matching pattern takes precedence; between patterns of
 * equal length, an allow rule wins over a disallow rule.
 *
 * @param {string} path
 * @param {Array.<Object>} rules
 * @return {Object?} The winning rule, or null if none matches
 * @private
 */
function findRule(path, rules) {
  var winner = null;
  rules.forEach(function (rule) {
    if (!matches(rule.pattern, path)) {
      return;
    }
    if (!winner || rule.pattern.length > winner.pattern.length) {
      // The longest matching rule takes precedence
      winner = rule;
    } else if (
      rule.pattern.length === winner.pattern.length &&
      rule.allow &&
      !winner.allow
    ) {
      // Equal length: allow takes precedence over disallow
      winner = rule;
    }
  });
  return winner;
}
/**
 * Converts the provided string into a URL object.
 *
 * Relative paths are resolved against a non-existent subdomain so they
 * can never conflict with a real host (worst case, relative URLs are
 * allowed on it).
 *
 * @param {string} url
 * @return {?URL} The parsed URL, or null if the string is not a valid URL
 * @private
 */
function parseUrl(url) {
  try {
    // Use a distinct name: the original re-declared `var url`, shadowing
    // its own parameter
    var parsed = new URL(url, 'http://robots-relative.samclarke.com/');
    if (!parsed.port) {
      // Make default ports explicit so URLs with and without an explicit
      // default port compare as equal in _getRule()
      parsed.port = parsed.protocol === 'https:' ? 443 : 80;
    }
    return parsed;
  } catch (e) {
    return null;
  }
}
/**
 * Parser for robots.txt files.
 *
 * @constructor
 * @param {string} url - The URL the robots.txt file belongs to; rules only
 *     apply to URLs with the same protocol, host and port (see _getRule)
 * @param {string} contents - The raw contents of the robots.txt file
 */
function Robots(url, contents) {
// Invalid URLs degrade to an empty object, making every lookup return undefined
this._url = parseUrl(url) || {};
// Object.create(null) keeps user-agent keys free of prototype interference
this._rules = Object.create(null);
this._sitemaps = [];
this._preferredHost = null;
// Must run last: parseRobots populates the fields initialized above
parseRobots(contents || '', this);
}
/**
 * Adds the specified allow/deny rule to the rules for the specified
 * user agents. An empty pattern only registers the user agents without
 * adding a rule.
 *
 * @param {Array.<string>} userAgents
 * @param {string} pattern
 * @param {boolean} allow
 * @param {number} [lineNumber] Should use 1-based indexing
 */
Robots.prototype.addRule = function (userAgents, pattern, allow, lineNumber) {
  var rules = this._rules;
  userAgents.forEach(function (userAgent) {
    var agentRules = rules[userAgent] || (rules[userAgent] = []);
    if (pattern) {
      agentRules.push({
        pattern: normaliseEncoding(pattern),
        allow: allow,
        lineNumber: lineNumber
      });
    }
  });
};
/**
 * Adds the specified crawl delay to the specified user agents.
 * Non-numeric delay strings are ignored, but the user agents are still
 * registered.
 *
 * @param {Array.<string>} userAgents
 * @param {string} delayStr
 */
Robots.prototype.setCrawlDelay = function (userAgents, delayStr) {
  var rules = this._rules;
  var delay = Number(delayStr);
  userAgents.forEach(function (userAgent) {
    var agentRules = rules[userAgent] || (rules[userAgent] = []);
    if (!isNaN(delay)) {
      // The delay is stored as a property on the rules array itself
      agentRules.crawlDelay = delay;
    }
  });
};
/**
 * Adds a sitemap URL.
 *
 * @param {string} url
 */
Robots.prototype.addSitemap = function (url) {
  this._sitemaps[this._sitemaps.length] = url;
};
/**
 * Sets the preferred host name declared by a "host" directive.
 *
 * @param {string} url - Host name (lower-cased by the parser)
 */
Robots.prototype.setPreferredHost = function (url) {
this._preferredHost = url;
};
/**
 * Returns the rule matching the given URL for the given user agent.
 *
 * Returns undefined when the URL does not belong to this robots.txt
 * file (different protocol, host or port) and null when no rule matches.
 *
 * @param {string} url
 * @param {string?} ua
 * @param {boolean} explicit - When true, do not fall back to the '*' group
 * @return {Object?}
 * @private
 */
Robots.prototype._getRule = function (url, ua, explicit) {
  var parsedUrl = parseUrl(url) || {};
  var userAgent = formatUserAgent(ua || '*');
  // The base URL must match, otherwise this robots.txt is not valid for it
  var sameOrigin =
    parsedUrl.protocol === this._url.protocol &&
    parsedUrl.hostname === this._url.hostname &&
    parsedUrl.port === this._url.port;
  if (!sameOrigin) {
    return;
  }
  var rules = this._rules[userAgent];
  if (!explicit) {
    rules = rules || this._rules['*'];
  }
  var path = urlEncodeToUpper(parsedUrl.pathname + parsedUrl.search);
  return findRule(path, rules || []);
};
/**
 * Returns true if allowed, false if not allowed.
 *
 * Will return undefined if the URL is not valid for this robots.txt file.
 *
 * @param {string} url
 * @param {string?} ua
 * @return {boolean?}
 */
Robots.prototype.isAllowed = function (url, ua) {
  var rule = this._getRule(url, ua, false);
  if (rule === undefined) {
    return;
  }
  // No matching rule means the URL is allowed by default
  return !rule || rule.allow;
};
/**
 * Returns the line number of the matching directive for the specified
 * URL and user agent, if any. Line numbers are 1-based.
 *
 * Returns -1 if there is no matching directive. If a rule was added
 * manually without a lineNumber, this returns undefined for that rule.
 *
 * @param {string} url
 * @param {string?} ua
 * @return {number?}
 */
Robots.prototype.getMatchingLineNumber = function (url, ua) {
  var rule = this._getRule(url, ua, false);
  if (rule) {
    return rule.lineNumber;
  }
  return -1;
};
/**
 * Returns the opposite of isAllowed().
 *
 * @param {string} url
 * @param {string?} ua
 * @return {boolean}
 */
Robots.prototype.isDisallowed = function (url, ua) {
  var allowed = this.isAllowed(url, ua);
  return !allowed;
};
/**
 * Returns true if the URL is explicitly disallowed for the specified
 * user agent (user-agent wildcards are discarded).
 *
 * This will return undefined if the URL is not valid for this
 * robots.txt file.
 *
 * @param {string} url
 * @param {string} ua
 * @return {boolean?}
 */
Robots.prototype.isExplicitlyDisallowed = function (url, ua) {
  var rule = this._getRule(url, ua, true);
  if (rule === undefined) {
    return;
  }
  // Only an existing matching rule that does not allow counts as disallowed
  return Boolean(rule) && !rule.allow;
};
/**
 * Gets the crawl delay for the given user agent, falling back to the
 * '*' group if there is one.
 *
 * Returns undefined if there is no crawl delay set.
 *
 * @param {string} ua
 * @return {number?}
 */
Robots.prototype.getCrawlDelay = function (ua) {
  var userAgent = formatUserAgent(ua || '*');
  var rules = this._rules[userAgent] || this._rules['*'] || {};
  return rules.crawlDelay;
};
/**
 * Returns the preferred host set by a "host" directive, if any.
 *
 * @return {string?} Host name, or null when none was declared
 */
Robots.prototype.getPreferredHost = function () {
return this._preferredHost;
};
/**
 * Returns an array of sitemap URLs, if there are any.
 *
 * @return {Array.<string>} A copy, so callers cannot mutate internal state
 */
Robots.prototype.getSitemaps = function () {
  return this._sitemaps.concat();
};
return Robots;
})();
// Export for CommonJS environments (e.g. the mocha test suite in
// tests/robots.js); in Rhino the IIFE result is used via the global
// `Robots` variable instead.
if (typeof module !== 'undefined' && module.exports) {
module.exports = Robots;
}

View file

@ -140,10 +140,18 @@ HopObject.prototype.onRequest = function() {
}
}
// Set up layout handler and skin path
HopObject.confirmConstructor(Layout);
res.handlers.layout = res.handlers.site.layout || new Layout;
res.skinpath = res.handlers.layout.getSkinPath();
if (res.handlers.site.enforceRobotsTxt()) {
res.status = 403
res.data.error = gettext('The <a href="{0}">robots.txt</a> file disallows access to this page.', res.handlers.site.href('robots.txt'));
root.error_action();
res.stop();
}
if (!this.getPermission(req.action)) {
if (!session.user) {
User.setLocation(root.href() + req.path);

View file

@ -143,6 +143,22 @@
</div>
</div>
<div class='uk-form-row'>
<label class='uk-form-label' for='trollFilter'>
<% gettext 'Robot rules' %>
</label>
<div class='uk-form-controls'>
<label>
<% site.checkbox robotsTxtMode %>
<% gettext enforced %>
</label>
<p class="uk-form-help-block">
<% gettext 'Edit the rules in the <a href="{0}Site/robots/edit">robots.txt</a> skin.' <% site.layout.skins.href %> %>
</p>
</div>
</div>
<div class='uk-form-row'>
<label class='uk-form-label' for='trollFilter'>
<% gettext 'Troll Filter' %>

View file

@ -39,6 +39,7 @@ this.handleMetadata('notificationMode');
this.handleMetadata('notified');
this.handleMetadata('pageSize');
this.handleMetadata('pageMode');
this.handleMetadata('robotsTxtMode');
this.handleMetadata('spamfilter');
this.handleMetadata('tagline');
this.handleMetadata('timeZone');
@ -46,7 +47,7 @@ this.handleMetadata('title');
this.handleMetadata('trollFilter');
/**
* Ffunction
* @function
* @returns {String[]}
* @see defineConstants
*/
@ -94,6 +95,13 @@ Site.getNotificationModes = defineConstants(Site, markgettext('Nobody'),
*/
Site.getCallbackModes = defineConstants(Site, markgettext('disabled'),
markgettext('enabled'));
/**
* @function
* @returns {String[]}
* @see defineConstants
*/
Site.getRobotsTxtModes = defineConstants(Site, markgettext('suggest'),
markgettext('enforce'));
/**
* @param {String} name A unique identifier also used in the URL of a site
@ -132,6 +140,7 @@ Site.add = function(data, user) {
configured: now,
created: now,
creator: user,
robotsTxtMode: Site.SUGGEST,
modified: now,
modifier: user,
status: user.status === User.PRIVILEGED ? Site.TRUSTED : user.status,
@ -367,6 +376,8 @@ Site.prototype.getFormOptions = function(name) {
switch (name) {
case 'archiveMode':
return Site.getArchiveModes();
case 'callbackMode':
return Site.getCallbackModes();
case 'commentMode':
return Site.getCommentModes();
case 'locale':
@ -379,12 +390,12 @@ Site.prototype.getFormOptions = function(name) {
return Site.getNotificationModes();
case 'pageMode':
return Site.getPageModes();
case 'robotsTxtMode':
return Site.getRobotsTxtModes();
case 'status':
return Site.getStatus();
case 'timeZone':
return getTimeZones(this.getLocale());
case 'callbackMode':
return Site.getCallbackModes();
default:
return HopObject.prototype.getFormOptions.apply(this, arguments);
}
@ -441,8 +452,9 @@ Site.prototype.update = function(data) {
archiveMode: data.archiveMode || Site.CLOSED,
callbackMode: data.callbackMode || Site.DISABLED,
callbackUrl: data.callbackUrl || this.callbackUrl || String.EMPTY,
imageDimensionLimits: [data.maxImageWidth, data.maxImageHeight],
commentMode: data.commentMode || Site.DISABLED,
robotsTxtMode: data.robotsTxtMode || Site.RELAXED,
imageDimensionLimits: [data.maxImageWidth, data.maxImageHeight],
locale: data.locale || root.getLocale().toString(),
mode: data.mode || Site.CLOSED,
notificationMode: data.notificationMode || Site.NOBODY,
@ -477,7 +489,8 @@ Site.prototype.main_css_action = function() {
res.push();
this.renderSkin('$Site#stylesheet');
this.renderSkin('Site#stylesheet');
var css = res.pop();
var css = res.pop()
.replace(/<(\/?style|!).*/g, ''); // TODO: Actually, a compatibility fix (earlier CSS skins contained the <style> element)
try {
lessParser.parse(css, function(error, less) {
@ -1124,3 +1137,28 @@ Site.prototype.callback = function(ref) {
}
return;
}
/**
 * Decides whether access to the current request's URL must be blocked
 * according to the site's robots.txt rules. Only applies when the site's
 * robotsTxtMode is set to "enforce"; a small set of URLs is always
 * exempt so the site cannot lock out its own owner.
 *
 * @returns {Boolean} True if the requested URL should be denied
 */
Site.prototype.enforceRobotsTxt = function() {
  if (this.robotsTxtMode !== Site.ENFORCE) {
    return false;
  }
  // Override some URLs to prevent a site from becoming inaccessible even for the owner
  const overrides = [
    this.href('edit'),
    this.href('main.css'),
    this.href('main.js'),
    this.href('robots.txt'),
    this.layout.href(),
    this.members.href()
  ];
  // The robots.txt contents come from the site's "Site#robots" skin
  const robotsTxt = root.renderSkinAsString('Site#robots');
  const robots = new Robots(this.href('robots.txt'), robotsTxt);
  const href = path.href(req.action);
  // NOTE(review): when href is already absolute, slice(1) still removes its
  // first character while the empty prefix is used — looks like a bug; confirm
  const fullUrl = (href.includes('://') ? '' : this.href()) + href.slice(1);
  // NOTE(review): Robots.isAllowed() returns undefined for URLs outside this
  // robots.txt's origin; !undefined is true, so such URLs get blocked — confirm intended
  return !overrides.some(href => fullUrl.includes(href))
    && !robots.isAllowed(fullUrl, req.getHeader('user-agent'));
}

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -125,6 +125,7 @@ global.messages['de'] = {
"Edit Poll": "Umfrage bearbeiten",
"Edit Story": "Beitrag bearbeiten",
"Edit the filter in the site settings.": "Der Filter kann in den Einstellungen bearbeitet werden.",
"Edit the rules in the <a href=\"{0}Site/robots/edit\">robots.txt</a> skin.": "Bearbeiten Sie die Regeln im <a href=\"{0}Site/robots/edit\">robots.txt</a>-Skin.",
"Edit {0}.{1}": "{0}.{1} bearbeiten",
"Enabled": "Aktiviert",
"Enter one filter {0}pattern{1} per line to be applied on every URL in the referrer and backlink lists.": "Geben Sie ein {0}Filter-Schema{1} pro Zeile ein, das für jede Adresse in den Rückverweis-Listen angewendet werden soll.",
@ -290,6 +291,7 @@ global.messages['de'] = {
"Resource type (e.g. Story or Comment)": "Art der Ressource (z.B. Beitrag oder Kommentar)",
"Restricted": "Eingeschränkt",
"Results": "Ergebnis",
"Robot rules": "Regeln für Robots",
"Role": "Rolle",
"Running": "Laufende",
"Running Polls": "Laufende Umfragen",
@ -359,6 +361,7 @@ global.messages['de'] = {
"Terms and Conditions": "Nutzungsbedingungen",
"Text": "Text",
"Thanks, your vote was registered. You can change your mind until the poll is closed.": "Danke, Ihre Stimme wurde gezählt. Bis die Umfrage beendet ist, können Sie Ihre Meinung jederzeit ändern.",
"The <a href=\"{0}\">robots.txt</a> file disallows access to this page.": "Die <a href=\"{0}\">robots.txt</a>-Datei verbietet den Zugriff auf diese Seite.",
"The Management": "Die Direktion",
"The URL endpoint for each of these APIs is located at": "Die Internet-Adresse für jede dieser Schnittstellen lautet",
"The account data will be available for download from here within the next days.": "Die Kontodaten stehen demnächst hier zum Download bereit.",
@ -529,6 +532,8 @@ global.messages['de'] = {
"e-mail": "E-Mail",
"e.g. {0}": "z.B. {0}",
"enabled": "aktiviert",
"enforce": "erzwingen",
"enforced": "erzwingen",
"export": "Exportieren",
"featured": "sichtbar",
"file": "Datei",
@ -578,6 +583,7 @@ global.messages['de'] = {
"soon": "in Kürze",
"stories": "Beiträge",
"story": "Beitrag",
"suggest": "vorschlagen",
"tag": "Stichwort",
"tags": "Stichworte",
"tomorrow": "morgen",

912
tests/robots.js Normal file
View file

@ -0,0 +1,912 @@
// Unit tests of the robots parser
// Source: <https://github.com/samclarke/robots-parser/blob/master/test/Robots.js>
// Copyright (c) 2014 Sam Clarke
// MIT License (MIT)
// Run with `npx nyc --reporter=text-summary --reporter=html --reporter=lcovonly mocha tests/robots.js`
// Set up the test environment with Antvilles version of the robots parser
const Robots = require('../code/Global/Robots.js');
const robotsParser = (url, contents) => new Robots(url, contents);
const { expect } = require('chai');
/**
 * Asserts that every URL in `allowed` is allowed and every URL in
 * `disallowed` is disallowed by the given robots.txt contents.
 *
 * @param {string} url - URL of the robots.txt file
 * @param {string} contents - Raw robots.txt text
 * @param {Array.<string>} allowed - URLs expected to be allowed
 * @param {Array.<string>} disallowed - URLs expected to be disallowed
 */
function testRobots(url, contents, allowed, disallowed) {
  var robots = robotsParser(url, contents);
  allowed.forEach(function (allowedUrl) {
    expect(robots.isAllowed(allowedUrl)).to.equal(true);
  });
  disallowed.forEach(function (disallowedUrl) {
    expect(robots.isDisallowed(disallowedUrl)).to.equal(true);
  });
}
describe('Robots', function () {
it('should parse the disallow directive', function () {
var contents = [
'User-agent: *',
'Disallow: /fish/',
'Disallow: /test.html'
].join('\n');
var allowed = [
'http://www.example.com/fish',
'http://www.example.com/Test.html'
];
var disallowed = [
'http://www.example.com/fish/index.php',
'http://www.example.com/fish/',
'http://www.example.com/test.html'
];
testRobots('http://www.example.com/robots.txt', contents, allowed, disallowed);
});
it('should parse the allow directive', function () {
var contents = [
'User-agent: *',
'Disallow: /fish/',
'Disallow: /test.html',
'Allow: /fish/test.html',
'Allow: /test.html'
].join('\n');
var allowed = [
'http://www.example.com/fish',
'http://www.example.com/fish/test.html',
'http://www.example.com/Test.html',
'http://www.example.com/test.html'
];
var disallowed = [
'http://www.example.com/fish/index.php',
'http://www.example.com/fish/',
];
testRobots('http://www.example.com/robots.txt', contents, allowed, disallowed);
});
it('should parse patterns', function () {
var contents = [
'User-agent: *',
'Disallow: /fish*.php',
'Disallow: /*.dext$',
'Disallow: /dir*'
].join('\n');
var allowed = [
'http://www.example.com/Fish.PHP',
'http://www.example.com/Fish.dext1',
'http://www.example.com/folder/dir.html',
'http://www.example.com/folder/dir/test.html'
];
var disallowed = [
'http://www.example.com/fish.php',
'http://www.example.com/fishheads/catfish.php?parameters',
'http://www.example.com/AnYthInG.dext',
'http://www.example.com/Fish.dext.dext',
'http://www.example.com/dir/test.html',
'http://www.example.com/directory.html'
];
testRobots('http://www.example.com/robots.txt', contents, allowed, disallowed);
});
it('should have the correct order precedence for allow and disallow', function () {
var contents = [
'User-agent: *',
'Disallow: /fish*.php',
'Allow: /fish/index.php',
'Disallow: /test',
'Allow: /test/',
'Disallow: /aa/',
'Allow: /aa/',
'Allow: /bb/',
'Disallow: /bb/',
].join('\n');
var allowed = [
'http://www.example.com/test/index.html',
'http://www.example.com/fish/index.php',
'http://www.example.com/test/',
'http://www.example.com/aa/',
'http://www.example.com/bb/',
'http://www.example.com/x/'
];
var disallowed = [
'http://www.example.com/fish.php',
'http://www.example.com/fishheads/catfish.php?parameters',
'http://www.example.com/test'
];
testRobots('http://www.example.com/robots.txt', contents, allowed, disallowed);
});
it('should have the correct order precedence for wildcards', function () {
var contents = [
'User-agent: *',
'Disallow: /*/',
'Allow: /x/',
].join('\n');
var allowed = [
'http://www.example.com/x/',
'http://www.example.com/fish.php',
'http://www.example.com/test'
];
var disallowed = [
'http://www.example.com/a/',
'http://www.example.com/xx/',
'http://www.example.com/test/index.html'
];
testRobots('http://www.example.com/robots.txt', contents, allowed, disallowed);
});
it('should parse lines delimitated by \\r', function () {
var contents = [
'User-agent: *',
'Disallow: /fish/',
'Disallow: /test.html'
].join('\r');
var allowed = [
'http://www.example.com/fish',
'http://www.example.com/Test.html'
];
var disallowed = [
'http://www.example.com/fish/index.php',
'http://www.example.com/fish/',
'http://www.example.com/test.html'
];
testRobots('http://www.example.com/robots.txt', contents, allowed, disallowed);
});
it('should parse lines delimitated by \\r\\n', function () {
var contents = [
'User-agent: *',
'Disallow: /fish/',
'Disallow: /test.html'
].join('\r\n');
var allowed = [
'http://www.example.com/fish',
'http://www.example.com/Test.html'
];
var disallowed = [
'http://www.example.com/fish/index.php',
'http://www.example.com/fish/',
'http://www.example.com/test.html'
];
testRobots('http://www.example.com/robots.txt', contents, allowed, disallowed);
});
it('should parse lines delimitated by mixed line endings', function () {
var contents = [
'User-agent: *\r',
'Disallow: /fish/\r\n',
'Disallow: /test.html\n\n'
].join('');
var allowed = [
'http://www.example.com/fish',
'http://www.example.com/Test.html'
];
var disallowed = [
'http://www.example.com/fish/index.php',
'http://www.example.com/fish/',
'http://www.example.com/test.html'
];
testRobots('http://www.example.com/robots.txt', contents, allowed, disallowed);
});
it('should ignore rules that are not in a group', function () {
var contents = [
'Disallow: /secret.html',
'Disallow: /test',
].join('\n');
var allowed = [
'http://www.example.com/secret.html',
'http://www.example.com/test/index.html',
'http://www.example.com/test/'
];
testRobots('http://www.example.com/robots.txt', contents, allowed, []);
});
it('should ignore comments', function () {
var contents = [
'#',
'# This is a comment',
'#',
'User-agent: *',
'# This is a comment',
'Disallow: /fish/ # ignore',
'# Disallow: fish',
'Disallow: /test.html'
].join('\n');
var allowed = [
'http://www.example.com/fish',
'http://www.example.com/Test.html'
];
var disallowed = [
'http://www.example.com/fish/index.php',
'http://www.example.com/fish/',
'http://www.example.com/test.html'
];
testRobots('http://www.example.com/robots.txt', contents, allowed, disallowed);
});
it('should ignore invalid lines', function () {
var contents = [
'invalid line',
'User-agent: *',
'Disallow: /fish/',
':::::another invalid line:::::',
'Disallow: /test.html',
'Unknown: tule'
].join('\n');
var allowed = [
'http://www.example.com/fish',
'http://www.example.com/Test.html'
];
var disallowed = [
'http://www.example.com/fish/index.php',
'http://www.example.com/fish/',
'http://www.example.com/test.html'
];
testRobots('http://www.example.com/robots.txt', contents, allowed, disallowed);
});
it('should ignore empty user-agent lines', function () {
var contents = [
'User-agent:',
'Disallow: /fish/',
'Disallow: /test.html'
].join('\n');
var allowed = [
'http://www.example.com/fish',
'http://www.example.com/Test.html',
'http://www.example.com/fish/index.php',
'http://www.example.com/fish/',
'http://www.example.com/test.html'
];
var disallowed = [];
testRobots('http://www.example.com/robots.txt', contents, allowed, disallowed);
});
it('should support groups with multiple user agents (case insensitive)', function () {
var contents = [
'User-agent: agenta',
'User-agent: agentb',
'Disallow: /fish',
].join('\n');
var robots = robotsParser('http://www.example.com/robots.txt', contents);
expect(robots.isAllowed("http://www.example.com/fish", "agenta")).to.equal(false);
});
it('should return undefined for invalid urls', function () {
var contents = [
'User-agent: *',
'Disallow: /secret.html',
'Disallow: /test',
].join('\n');
var invalidUrls = [
'http://example.com/secret.html',
'http://ex ample.com/secret.html',
'http://www.example.net/test/index.html',
'http://www.examsple.com/test/',
'example.com/test/',
':::::;;`\\|/.example.com/test/'
];
var robots = robotsParser('http://www.example.com/robots.txt', contents);
invalidUrls.forEach(function (url) {
expect(robots.isAllowed(url)).to.equal(undefined);
});
});
it('should handle Unicode, urlencoded and punycode URLs', function () {
var contents = [
'User-agent: *',
'Disallow: /secret.html',
'Disallow: /test',
].join('\n');
var allowed = [
'http://www.münich.com/index.html',
'http://www.xn--mnich-kva.com/index.html',
'http://www.m%C3%BCnich.com/index.html'
];
var disallowed = [
'http://www.münich.com/secret.html',
'http://www.xn--mnich-kva.com/secret.html',
'http://www.m%C3%BCnich.com/secret.html'
];
testRobots('http://www.münich.com/robots.txt', contents, allowed, disallowed);
testRobots('http://www.xn--mnich-kva.com/robots.txt', contents, allowed, disallowed);
testRobots('http://www.m%C3%BCnich.com/robots.txt', contents, allowed, disallowed);
});
it('should handle Unicode and urlencoded paths', function () {
var contents = [
'User-agent: *',
'Disallow: /%CF%80',
'Disallow: /%e2%9d%83',
'Disallow: /%a%a',
'Disallow: /💩',
'Disallow: /✼*t$',
'Disallow: /%E2%9C%A4*t$',
'Disallow: /✿%a',
'Disallow: /http%3A%2F%2Fexample.org'
].join('\n');
var allowed = [
'http://www.example.com/✼testing',
'http://www.example.com/%E2%9C%BCtesting',
'http://www.example.com/✤testing',
'http://www.example.com/%E2%9C%A4testing',
'http://www.example.com/http://example.org',
'http://www.example.com/http:%2F%2Fexample.org'
];
var disallowed = [
'http://www.example.com/%CF%80',
'http://www.example.com/%CF%80/index.html',
'http://www.example.com/π',
'http://www.example.com/π/index.html',
'http://www.example.com/%e2%9d%83',
'http://www.example.com/%E2%9D%83/index.html',
'http://www.example.com/❃',
'http://www.example.com/❃/index.html',
'http://www.example.com/%F0%9F%92%A9',
'http://www.example.com/%F0%9F%92%A9/index.html',
'http://www.example.com/💩',
'http://www.example.com/💩/index.html',
'http://www.example.com/%a%a',
'http://www.example.com/%a%a/index.html',
'http://www.example.com/✼test',
'http://www.example.com/%E2%9C%BCtest',
'http://www.example.com/✤test',
'http://www.example.com/%E2%9C%A4testt',
'http://www.example.com/✿%a',
'http://www.example.com/%E2%9C%BF%atest',
'http://www.example.com/http%3A%2F%2Fexample.org'
];
testRobots('http://www.example.com/robots.txt', contents, allowed, disallowed);
});
it('should handle lone high / low surrogates', function () {
var contents = [
'User-agent: *',
'Disallow: /\uD800',
'Disallow: /\uDC00'
].join('\n');
// These are invalid so can't be disallowed
var allowed = [
'http://www.example.com/\uDC00',
'http://www.example.com/\uD800'
];
var disallowed = [];
testRobots('http://www.example.com/robots.txt', contents, allowed, disallowed);
});
it('should ignore host case', function () {
var contents = [
'User-agent: *',
'Disallow: /secret.html',
'Disallow: /test',
].join('\n');
var allowed = [
'http://www.example.com/index.html',
'http://www.ExAmPlE.com/index.html',
'http://www.EXAMPLE.com/index.html'
];
var disallowed = [
'http://www.example.com/secret.html',
'http://www.ExAmPlE.com/secret.html',
'http://www.EXAMPLE.com/secret.html'
];
testRobots('http://www.eXample.com/robots.txt', contents, allowed, disallowed);
});
it('should handle relative paths', function () {
var contents = [
'User-agent: *',
'Disallow: /fish',
'Allow: /fish/test',
].join('\n');
var robots = robotsParser('/robots.txt', contents);
expect(robots.isAllowed('/fish/test')).to.equal(true);
expect(robots.isAllowed('/fish')).to.equal(false);
});
it('should not allow relative paths if domain specified', function () {
var contents = [
'User-agent: *',
'Disallow: /fish',
'Allow: /fish/test',
].join('\n');
var robots = robotsParser('http://www.example.com/robots.txt', contents);
expect(robots.isAllowed('/fish/test')).to.equal(undefined);
expect(robots.isAllowed('/fish')).to.equal(undefined);
});
it('should not treat invalid robots.txt URLs as relative', function () {
var contents = [
'User-agent: *',
'Disallow: /fish',
'Allow: /fish/test',
].join('\n');
var robots = robotsParser('https://ex ample.com/robots.txt', contents);
expect(robots.isAllowed('/fish/test')).to.equal(undefined);
expect(robots.isAllowed('/fish')).to.equal(undefined);
});
it('should not allow URls if domain specified and robots.txt is relative', function () {
var contents = [
'User-agent: *',
'Disallow: /fish',
'Allow: /fish/test',
].join('\n');
var robots = robotsParser('/robots.txt', contents);
expect(robots.isAllowed('http://www.example.com/fish/test')).to.equal(undefined);
expect(robots.isAllowed('http://www.example.com/fish')).to.equal(undefined);
});
it('should allow all if empty robots.txt', function () {
var allowed = [
'http://www.example.com/secret.html',
'http://www.example.com/test/index.html',
'http://www.example.com/test/'
];
var robots = robotsParser('http://www.example.com/robots.txt', '');
allowed.forEach(function (url) {
expect(robots.isAllowed(url)).to.equal(true);
});
});
it('should treat null as allowing all', function () {
	// Null contents behave like an empty robots.txt, with and without
	// an explicit user agent.
	const parser = robotsParser('http://www.example.com/robots.txt', null);

	expect(parser.isAllowed("http://www.example.com/", "userAgent")).to.equal(true);
	expect(parser.isAllowed("http://www.example.com/")).to.equal(true);
});
it('should handle invalid robots.txt urls', function () {
	const robotsTxt = [
		'user-agent: *',
		'disallow: /',
		'host: www.example.com',
		'sitemap: /sitemap.xml'
	].join('\n');

	// None of these is a usable robots.txt location.
	const badUrls = [
		undefined,
		null,
		'null',
		':/wom/test/'
	];

	badUrls.forEach(function (robotsUrl) {
		const parser = robotsParser(robotsUrl, robotsTxt);
		// Path matching is impossible, but host and sitemap data
		// must still be extracted from the contents.
		expect(parser.isAllowed('http://www.example.com/index.html')).to.equal(undefined);
		expect(parser.getPreferredHost()).to.equal('www.example.com');
		expect(parser.getSitemaps()).to.eql(['/sitemap.xml']);
	});
});
it('should parse the crawl-delay directive', function () {
	const robotsTxt = [
		'user-agent: a',
		'crawl-delay: 1',
		'user-agent: b',
		'disallow: /d',
		'user-agent: c',
		'user-agent: d',
		'crawl-delay: 10'
	].join('\n');

	const parser = robotsParser('http://www.example.com/robots.txt', robotsTxt);

	// Agents c and d share one group and therefore the same delay;
	// agent b has rules but no crawl-delay of its own.
	expect(parser.getCrawlDelay('a')).to.equal(1);
	expect(parser.getCrawlDelay('b')).to.equal(undefined);
	expect(parser.getCrawlDelay('c')).to.equal(10);
	expect(parser.getCrawlDelay('d')).to.equal(10);
	expect(parser.getCrawlDelay()).to.equal(undefined);
});
it('should ignore invalid crawl-delay directives', function () {
	// Every crawl-delay value below is malformed and must be dropped.
	const robotsTxt = [
		'user-agent: a',
		'crawl-delay: 1.2.1',
		'user-agent: b',
		'crawl-delay: 1.a0',
		'user-agent: c',
		'user-agent: d',
		'crawl-delay: 10a'
	].join('\n');

	const parser = robotsParser('http://www.example.com/robots.txt', robotsTxt);

	['a', 'b', 'c', 'd'].forEach(function (agent) {
		expect(parser.getCrawlDelay(agent)).to.equal(undefined);
	});
});
it('should parse the sitemap directive', function () {
	// Note: the trailing space on the last sitemap line is intentional —
	// it checks that values are trimmed.
	const robotsTxt = [
		'user-agent: a',
		'crawl-delay: 1',
		'sitemap: http://example.com/test.xml',
		'user-agent: b',
		'disallow: /d',
		'sitemap: /sitemap.xml',
		'sitemap: http://example.com/test/sitemap.xml '
	].join('\n');

	const parser = robotsParser('http://www.example.com/robots.txt', robotsTxt);

	// Sitemaps are collected globally, regardless of user-agent group.
	expect(parser.getSitemaps()).to.eql([
		'http://example.com/test.xml',
		'/sitemap.xml',
		'http://example.com/test/sitemap.xml'
	]);
});
it('should parse the host directive', function () {
	const robotsTxt = [
		'user-agent: a',
		'crawl-delay: 1',
		'host: www.example.net',
		'user-agent: b',
		'disallow: /d',
		'host: example.com'
	].join('\n');

	const parser = robotsParser('http://www.example.com/robots.txt', robotsTxt);

	// With multiple host directives, the last one wins.
	expect(parser.getPreferredHost()).to.equal('example.com');
});
it('should parse empty and invalid directives', function () {
	// Parsing degenerate input must not throw; no return value is
	// asserted — this is purely a does-not-crash test.
	const robotsTxt = [
		'user-agent:',
		'user-agent:::: a::',
		'crawl-delay:',
		'crawl-delay:::: 0:',
		'host:',
		'host:: example.com',
		'sitemap:',
		'sitemap:: site:map.xml',
		'disallow:',
		'disallow::: /:',
		'allow:',
		'allow::: /:',
	].join('\n');

	robotsParser('http://www.example.com/robots.txt', robotsTxt);
});
it('should treat only the last host directive as valid', function () {
	const robotsTxt = [
		'user-agent: a',
		'crawl-delay: 1',
		'host: www.example.net',
		'user-agent: b',
		'disallow: /d',
		'host: example.net',
		'host: example.com'
	].join('\n');

	const parser = robotsParser('http://www.example.com/robots.txt', robotsTxt);

	// Earlier host values are superseded by the final one.
	expect(parser.getPreferredHost()).to.equal('example.com');
});
it('should return null when there is no host directive', function () {
	const robotsTxt = [
		'user-agent: a',
		'crawl-delay: 1',
		'user-agent: b',
		'disallow: /d',
	].join('\n');

	const parser = robotsParser('http://www.example.com/robots.txt', robotsTxt);

	expect(parser.getPreferredHost()).to.equal(null);
});
it('should fallback to * when a UA has no rules of its own', function () {
	const robotsTxt = [
		'user-agent: *',
		'crawl-delay: 1',
		'user-agent: b',
		'crawl-delay: 12',
		'user-agent: c',
		'user-agent: d',
		'crawl-delay: 10'
	].join('\n');

	const parser = robotsParser('http://www.example.com/robots.txt', robotsTxt);

	// Unknown agents inherit the * group; "dd" does not match the "d"
	// group and therefore falls back as well.
	expect(parser.getCrawlDelay('should-fall-back')).to.equal(1);
	expect(parser.getCrawlDelay('d')).to.equal(10);
	expect(parser.getCrawlDelay('dd')).to.equal(1);
});
it('should not fallback to * when a UA has rules', function () {
	// Agent b has its own (empty disallow) group, so it must not
	// inherit the * group's crawl-delay.
	const robotsTxt = [
		'user-agent: *',
		'crawl-delay: 1',
		'user-agent: b',
		'disallow:'
	].join('\n');

	const parser = robotsParser('http://www.example.com/robots.txt', robotsTxt);

	expect(parser.getCrawlDelay('b')).to.equal(undefined);
});
it('should handle UAs with object property names', function () {
	// Guards against Object.prototype lookups leaking into the rule
	// table when the UA string is e.g. "constructor".
	const robotsTxt = [
		'User-agent: *',
		'Disallow: /fish',
	].join('\n');

	const parser = robotsParser('http://www.example.com/robots.txt', robotsTxt);

	['constructor', '__proto__'].forEach(function (agent) {
		expect(parser.isAllowed('http://www.example.com/fish', agent)).to.equal(false);
	});
});
it('should ignore version numbers in the UA string', function () {
	const robotsTxt = [
		'user-agent: *',
		'crawl-delay: 1',
		'user-agent: b',
		'crawl-delay: 12',
		'user-agent: c',
		'user-agent: d',
		'crawl-delay: 10'
	].join('\n');

	const parser = robotsParser('http://www.example.com/robots.txt', robotsTxt);

	// The "/x.y.z" suffix must be stripped before group matching.
	expect(parser.getCrawlDelay('should-fall-back/1.0.0')).to.equal(1);
	expect(parser.getCrawlDelay('d/12')).to.equal(10);
	expect(parser.getCrawlDelay('dd / 0-32-3')).to.equal(1);
	expect(parser.getCrawlDelay('b / 1.0')).to.equal(12);
});
it('should return the line number of the matching directive', function () {
	// Line numbers are 1-based positions within the robots.txt below;
	// -1 means no directive matched.
	const robotsTxt = [
		'',
		'User-agent: *',
		'',
		'Disallow: /fish/',
		'Disallow: /test.html',
		'Allow: /fish/test.html',
		'Allow: /test.html',
		'',
		'User-agent: a',
		'allow: /',
		'',
		'User-agent: b',
		'disallow: /test',
		'disallow: /t*t',
		'',
		'User-agent: c',
		'Disallow: /fish*.php',
		'Allow: /fish/index.php'
	].join('\n');

	const parser = robotsParser('http://www.example.com/robots.txt', robotsTxt);

	// Default agent matches against the * group.
	expect(parser.getMatchingLineNumber('http://www.example.com/fish')).to.equal(-1);
	expect(parser.getMatchingLineNumber('http://www.example.com/fish/test.html')).to.equal(6);
	expect(parser.getMatchingLineNumber('http://www.example.com/Test.html')).to.equal(-1);
	expect(parser.getMatchingLineNumber('http://www.example.com/fish/index.php')).to.equal(4);
	expect(parser.getMatchingLineNumber('http://www.example.com/fish/')).to.equal(4);
	expect(parser.getMatchingLineNumber('http://www.example.com/test.html')).to.equal(7);

	// Named agents match against their own groups.
	expect(parser.getMatchingLineNumber('http://www.example.com/test.html', 'a')).to.equal(10);
	expect(parser.getMatchingLineNumber('http://www.example.com/fish.php', 'c')).to.equal(17);
	expect(parser.getMatchingLineNumber('http://www.example.com/fish/index.php', 'c')).to.equal(18);
});
it('should handle large wildcards efficiently', function () {
	// A pathological pattern with 2048 consecutive wildcards must not
	// cause catastrophic matching behavior.
	const robotsTxt = [
		'User-agent: *',
		'Disallow: /' + '*'.repeat(2048) + '.html',
	].join('\n');
	const allowedUrls = [
		'http://www.example.com/' + 'sub'.repeat(2048) + 'folder/index.php',
	];
	const disallowedUrls = [
		'http://www.example.com/secret.html'
	];

	const startedAt = Date.now();
	testRobots('http://www.eXample.com/robots.txt', robotsTxt, allowedUrls, disallowedUrls);
	const elapsedMs = Date.now() - startedAt;

	// Should take less than 500 ms (high to allow for variableness of
	// machines running the test, should normally be much less)
	expect(elapsedMs).to.be.lessThan(500);
});
it('should honor given port number', function () {
	// Rules from robots.txt on port 8080 must not apply to the default
	// port (80) of the same host, and vice versa.
	const robotsTxt = [
		'User-agent: *',
		'Disallow: /fish/',
		'Disallow: /test.html'
	].join('\n');
	const allowedUrls = [
		'http://www.example.com:8080/fish',
		'http://www.example.com:8080/Test.html'
	];
	const disallowedUrls = [
		'http://www.example.com/fish',
		'http://www.example.com/Test.html',
		'http://www.example.com:80/fish',
		'http://www.example.com:80/Test.html'
	];

	testRobots('http://www.example.com:8080/robots.txt', robotsTxt, allowedUrls, disallowedUrls);
});
it('should default to port 80 for http: if no port given', function () {
	// An http: robots.txt without a port is equivalent to port 80,
	// so :80 URLs are governed by it while :443 URLs are not.
	const robotsTxt = [
		'User-agent: *',
		'Disallow: /fish/',
		'Disallow: /test.html'
	].join('\n');
	const allowedUrls = [
		'http://www.example.com:80/fish',
		'http://www.example.com:80/Test.html'
	];
	const disallowedUrls = [
		'http://www.example.com:443/fish',
		'http://www.example.com:443/Test.html',
		'http://www.example.com:80/fish/index.php',
		'http://www.example.com:80/fish/',
		'http://www.example.com:80/test.html'
	];

	testRobots('http://www.example.com/robots.txt', robotsTxt, allowedUrls, disallowedUrls);
});
it('should default to port 443 for https: if no port given', function () {
	// An https: robots.txt without a port is equivalent to port 443,
	// so :443 URLs are governed by it while :80 URLs are not.
	const robotsTxt = [
		'User-agent: *',
		'Disallow: /fish/',
		'Disallow: /test.html'
	].join('\n');
	const allowedUrls = [
		'https://www.example.com:443/fish',
		'https://www.example.com:443/Test.html',
		'https://www.example.com/fish',
		'https://www.example.com/Test.html'
	];
	const disallowedUrls = [
		'http://www.example.com:80/fish',
		'http://www.example.com:80/Test.html',
		'http://www.example.com:443/fish/index.php',
		'http://www.example.com:443/fish/',
		'http://www.example.com:443/test.html'
	];

	testRobots('https://www.example.com/robots.txt', robotsTxt, allowedUrls, disallowedUrls);
});
it('should not be disallowed when wildcard is used in explicit mode', function () {
	// In explicit mode a named agent is only blocked by rules in its
	// own group, never by the * group.
	const robotsTxt = [
		'User-agent: *',
		'Disallow: /',
	].join('\n');
	const url = 'https://www.example.com/hello';
	const userAgent = 'SomeBot';

	const parser = robotsParser(url, robotsTxt);

	expect(parser.isExplicitlyDisallowed(url, userAgent)).to.equal(false);
});
it('should be disallowed when user agent equal robots rule in explicit mode', function () {
	// The rule group names the agent directly, so explicit mode blocks it.
	const robotsTxt = [
		'User-agent: SomeBot',
		'Disallow: /',
	].join('\n');
	const url = 'https://www.example.com/hello';
	const userAgent = 'SomeBot';

	const parser = robotsParser(url, robotsTxt);

	expect(parser.isExplicitlyDisallowed(url, userAgent)).to.equal(true);
});
it('should return undefined when given an invalid URL in explicit mode', function () {
	// The queried URL does not belong to the robots.txt origin
	// ("http://example.com"), so the result must be undefined.
	const robotsTxt = [
		'User-agent: SomeBot',
		'Disallow: /',
	].join('\n');
	const url = 'https://www.example.com/hello';
	const userAgent = 'SomeBot';

	const parser = robotsParser('http://example.com', robotsTxt);

	expect(parser.isExplicitlyDisallowed(url, userAgent)).to.equal(undefined);
});
});

View file

@ -7,3 +7,8 @@
summary {
display: revert;
}
// TODO: Actually a compatibility fix. See https://tobi.antville.org/stories/2317931/
td[nowrap][width]:not([width$="%"]) {
white-space: normal
}