Merge branch 'main' into automatically-set-root-cookie
# Conflicts: # .github/workflows/deploy.yml
This commit is contained in:
commit
a7fc1df893
10 changed files with 1905 additions and 312 deletions
1
.github/workflows/deploy.yml
vendored
1
.github/workflows/deploy.yml
vendored
|
|
@ -24,7 +24,6 @@ jobs:
|
|||
run: ./gradlew :build
|
||||
|
||||
- name: Copy files to server
|
||||
# The rsync command applies the same filters as the one in tools/extras/deploy.sh
|
||||
run: |
|
||||
rsync ./build/install/antville/ ${{ inputs.hostname }}:./apps/antville/ \
|
||||
--archive --compress --delete --verbose \
|
||||
|
|
|
|||
543
code/Global/Robots.js
Normal file
543
code/Global/Robots.js
Normal file
|
|
@ -0,0 +1,543 @@
|
|||
// Robots parser adapted for Rhino-compatible JavaScript
|
||||
// Source: <https://github.com/samclarke/robots-parser>
|
||||
// Copyright (c) 2014 Sam Clarke
|
||||
// Copyright (c) 2025 Antville.org
|
||||
// MIT License (MIT)
|
||||
|
||||
// Transformation steps:
|
||||
// 1. Add IIFE around the code
|
||||
// 2. Replace module.exports with return statement
|
||||
// 3. Add conditional module.exports for CommonJS support
|
||||
// 4. Add URL class imitation
|
||||
|
||||
var Robots = (() => {
|
||||
/**
 * Half-baked (read-only) imitation of the URL class of Node.js
 *
 * Builds a plain object with WHATWG-URL-like properties on top of
 * java.net.URL (available in the Rhino runtime).
 *
 * @param {string} str - Absolute or relative URL string
 * @param {string} [base] - Prefix used when `str` has no scheme
 * @returns {Object} Read-only object imitating the URL interface
 */
function nodeJsUrl(str, base) {
  if (!str.includes('://')) {
    str = (base || 'http://localhost') + str;
  }

  const url = new java.net.URL(str);
  // java.net.URL reports -1 when the URL contains no explicit port
  const port = url.port < 0 ? '' : url.port;
  const userInfo = (url.getUserInfo() || "").split(':');

  return {
    hash: url.ref ? '#' + url.ref : '',
    href: url.toString(),
    host: url.host + (port ? ':' + port : port),
    hostname: url.host,
    password: userInfo[1] || "",
    pathname: url.path,
    origin: url.protocol + '://' + url.host + (port ? ':' + port : port),
    port,
    protocol: url.protocol,
    // FIX: was `url.queryy` (typo), which made `search` always empty
    search: url.query ? '?' + url.query : '',
    // Stubbed: callers may probe searchParams but get no data back
    searchParams: {
      get: () => null,
      set: () => null
    },
    username: userInfo[0] || "",
  };
}

// Install the shim only where no native URL implementation exists
if (typeof URL === 'undefined') {
  globalThis.URL = nodeJsUrl;
}
|
||||
|
||||
/**
 * Strips leading and trailing white space from a line.
 *
 * Arrays are handled recursively: every element is trimmed and a new
 * array is returned.
 *
 * @param {string|Array} line
 * @return {string|Array} The trimmed line, or null for falsy input
 * @private
 */
function trimLine(line) {
  if (!line) {
    return null;
  }

  return Array.isArray(line) ? line.map(trimLine) : String(line).trim();
}
|
||||
|
||||
/**
 * Removes a comment (everything from the first `#` onwards) from a line.
 *
 * @param {string} line
 * @return {string} The line without its comment part
 * @private
 */
function removeComments(line) {
  var commentStartIndex = line.indexOf('#');
  if (commentStartIndex > -1) {
    // `slice` instead of the deprecated `substr` (same result for
    // non-negative start/end arguments)
    return line.slice(0, commentStartIndex);
  }

  return line;
}
|
||||
|
||||
/**
 * Splits a line at the first occurrence of `:` into [field, value].
 *
 * @param {string} line
 * @return {Array.<string>} Two-element array, or null when the line is
 *     falsy or contains no colon
 * @private
 */
function splitLine(line) {
  const separatorIndex = String(line).indexOf(':');

  if (!line || separatorIndex < 0) {
    return null;
  }

  const field = line.slice(0, separatorIndex);
  const value = line.slice(separatorIndex + 1);

  return [field, value];
}
|
||||
|
||||
/**
 * Normalises the user-agent string by converting it to
 * lower case and removing any version numbers.
 *
 * e.g. `Robot/1.0` becomes `robot`
 *
 * @param {string} userAgent
 * @return {string} The normalised, trimmed user-agent name
 * @private
 */
function formatUserAgent(userAgent) {
  var formattedUserAgent = userAgent.toLowerCase();

  // Strip the version number from robot/1.0 user agents
  var idx = formattedUserAgent.indexOf('/');
  if (idx > -1) {
    // `slice` instead of the deprecated `substr`
    formattedUserAgent = formattedUserAgent.slice(0, idx);
  }

  return formattedUserAgent.trim();
}
|
||||
|
||||
/**
 * Normalises the URL encoding of a path by URI-encoding unicode
 * characters and upper-casing all percent escapes.
 *
 * @param {string} path
 * @return {string} The normalised path, or the input unchanged when
 *     encoding fails
 * @private
 */
function normaliseEncoding(path) {
  try {
    // encodeURI escapes existing `%` signs as `%25`; undo that so
    // already-encoded sequences are not double-encoded
    const encoded = encodeURI(path).replace(/%25/g, '%');
    return urlEncodeToUpper(encoded);
  } catch (e) {
    return path;
  }
}
|
||||
|
||||
/**
 * Converts URL percent escapes to upper case.
 *
 * e.g.: %2a%ef becomes %2A%EF
 *
 * @param {string} path
 * @return {string}
 * @private
 */
function urlEncodeToUpper(path) {
  return path.replace(/%[0-9a-fA-F]{2}/g, (escape) => escape.toUpperCase());
}
|
||||
|
||||
/**
 * Matches a pattern with the specified path
 *
 * Uses same algorithm to match patterns as the Google implementation in
 * google/robotstxt so it should be consistent with the spec.
 *
 * Patterns may contain `*` (matches zero or more characters) and a
 * trailing `$` (anchors the pattern to the end of the path).
 *
 * @see https://github.com/google/robotstxt/blob/f465f0ede81099dd8bc4aeb2966b3a892bd488b3/robots.cc#L74
 * @param {string} pattern
 * @param {string} path
 * @return {boolean}
 * @private
 */
function matches(pattern, path) {
  // I've added extra comments to try make this easier to understand

  // Stores the lengths of all the current matching substrings.
  // Maximum number of possible matching lengths is every length in path plus
  // 1 to handle 0 length too (if pattern starts with * which is zero or more)
  var matchingLengths = new Array(path.length + 1);
  var numMatchingLengths = 1;

  // Initially longest match is 0
  matchingLengths[0] = 0;

  for (var p = 0; p < pattern.length; p++) {
    // If $ is at the end of pattern then we must match the whole path.
    // Which is true if the longest matching length matches path length
    if (pattern[p] === '$' && p + 1 === pattern.length) {
      return matchingLengths[numMatchingLengths - 1] === path.length;
    }

    // Handle wildcards
    if (pattern[p] == '*') {
      // Wildcard so all substrings minus the current smallest matching
      // length are matches
      numMatchingLengths = path.length - matchingLengths[0] + 1;

      // Update matching lengths to include the smallest all the way up
      // to numMatchingLengths
      // Don't update smallest possible match as * matches zero or more
      // so the smallest current match is also valid
      for (var i = 1; i < numMatchingLengths; i++) {
        matchingLengths[i] = matchingLengths[i - 1] + 1;
      }
    } else {
      // Check the char at the matching length matches the pattern, if it
      // does increment it and add it as a valid length, ignore if not.
      var numMatches = 0;
      for (var i = 0; i < numMatchingLengths; i++) {
        if (
          matchingLengths[i] < path.length &&
          path[matchingLengths[i]] === pattern[p]
        ) {
          matchingLengths[numMatches++] = matchingLengths[i] + 1;
        }
      }

      // No paths matched the current pattern char so not a match
      if (numMatches == 0) {
        return false;
      }

      numMatchingLengths = numMatches;
    }
  }

  return true;
}
|
||||
|
||||
/**
 * Parses the contents of a robots.txt file and feeds the extracted
 * directives into the provided Robots instance.
 *
 * Consecutive `user-agent` lines form one agent group; the directives
 * that follow apply to every agent collected in that group.
 *
 * @param {string} contents - Raw robots.txt contents
 * @param {Robots} robots - Receives addRule/setCrawlDelay/addSitemap/
 *     setPreferredHost calls for each recognised directive
 * @private
 */
function parseRobots(contents, robots) {
  var newlineRegex = /\r\n|\r|\n/;
  // Each entry becomes [field, value] (or null for lines without a colon)
  var lines = contents
    .split(newlineRegex)
    .map(removeComments)
    .map(splitLine)
    .map(trimLine);

  var currentUserAgents = [];
  // True while the previous line was NOT a user-agent line; a fresh run
  // of user-agent lines then starts a new (emptied) agent group
  var isNoneUserAgentState = true;
  for (var i = 0; i < lines.length; i++) {
    var line = lines[i];

    // Skip unparsable lines (no colon) and lines with an empty field
    if (!line || !line[0]) {
      continue;
    }

    switch (line[0].toLowerCase()) {
      case 'user-agent':
        if (isNoneUserAgentState) {
          currentUserAgents.length = 0;
        }

        if (line[1]) {
          currentUserAgents.push(formatUserAgent(line[1]));
        }
        break;
      case 'disallow':
        // i + 1 = 1-based line number of the directive
        robots.addRule(currentUserAgents, line[1], false, i + 1);
        break;
      case 'allow':
        robots.addRule(currentUserAgents, line[1], true, i + 1);
        break;
      case 'crawl-delay':
        robots.setCrawlDelay(currentUserAgents, line[1]);
        break;
      case 'sitemap':
        if (line[1]) {
          robots.addSitemap(line[1]);
        }
        break;
      case 'host':
        if (line[1]) {
          robots.setPreferredHost(line[1].toLowerCase());
        }
        break;
    }

    isNoneUserAgentState = line[0].toLowerCase() !== 'user-agent';
  }
}
|
||||
|
||||
/**
 * Finds the rule that applies to a path, if any.
 *
 * The longest matching pattern takes precedence; when two matching
 * patterns have the same length, an allow rule beats a disallow rule.
 *
 * @param {string} path
 * @param {Array.<Object>} rules
 * @return {Object?} The winning rule, or null when nothing matches
 * @private
 */
function findRule(path, rules) {
  let matchedRule = null;

  for (const rule of rules) {
    if (!matches(rule.pattern, path)) {
      continue;
    }

    if (matchedRule === null) {
      matchedRule = rule;
      continue;
    }

    const lengthDifference = rule.pattern.length - matchedRule.pattern.length;

    if (lengthDifference > 0) {
      // Longer pattern wins
      matchedRule = rule;
    } else if (lengthDifference === 0 && rule.allow && !matchedRule.allow) {
      // Equal length: allow wins over disallow
      matchedRule = rule;
    }
  }

  return matchedRule;
}
|
||||
|
||||
/**
 * Converts the provided string into a URL object.
 *
 * Will return null if the provided string is not a valid URL.
 *
 * @param {string} url
 * @return {?URL}
 * @private
 */
function parseUrl(url) {
  try {
    // Base URL used to resolve relative paths. The subdomain does not
    // exist, so it can never conflict with a real crawl target; worst
    // case it merely allows relative URLs on it.
    const parsed = new URL(url, 'http://robots-relative.samclarke.com/');

    if (!parsed.port) {
      parsed.port = parsed.protocol === 'https:' ? 443 : 80;
    }

    return parsed;
  } catch (e) {
    return null;
  }
}
|
||||
|
||||
/**
 * Parses a robots.txt file.
 *
 * @param {string} url - URL the robots.txt file belongs to
 * @param {string} contents - Raw contents of the robots.txt file
 * @constructor
 */
function Robots(url, contents) {
  this._url = parseUrl(url) || {};
  this._sitemaps = [];
  this._preferredHost = null;
  // Prototype-less map of user agent name -> array of rules
  this._rules = Object.create(null);

  parseRobots(contents || '', this);
}
|
||||
|
||||
/**
 * Adds the specified allow/deny rule to the rules
 * for the specified user-agents.
 *
 * @param {Array.<string>} userAgents
 * @param {string} pattern
 * @param {boolean} allow
 * @param {number} [lineNumber] Should use 1-based indexing
 */
Robots.prototype.addRule = function (userAgents, pattern, allow, lineNumber) {
  const rules = this._rules;

  for (const userAgent of userAgents) {
    // Ensure the agent group exists even when the rule has no pattern
    if (!rules[userAgent]) {
      rules[userAgent] = [];
    }

    if (!pattern) {
      continue;
    }

    rules[userAgent].push({
      pattern: normaliseEncoding(pattern),
      allow: allow,
      lineNumber: lineNumber
    });
  }
};
|
||||
|
||||
/**
 * Adds the specified crawl delay to the specified user agents.
 *
 * @param {Array.<string>} userAgents
 * @param {string} delayStr
 */
Robots.prototype.setCrawlDelay = function (userAgents, delayStr) {
  const rules = this._rules;
  const delay = Number(delayStr);

  for (const userAgent of userAgents) {
    // Ensure the agent group exists even for unparsable delay values
    if (!rules[userAgent]) {
      rules[userAgent] = [];
    }

    if (!isNaN(delay)) {
      // The delay is stored as an extra property on the rules array
      rules[userAgent].crawlDelay = delay;
    }
  }
};
|
||||
|
||||
/**
 * Registers a sitemap URL found in the robots.txt file.
 *
 * @param {string} url
 */
Robots.prototype.addSitemap = function (url) {
  const sitemaps = this._sitemaps;
  sitemaps.push(url);
};
|
||||
|
||||
/**
 * Remembers the preferred host name declared by a `host` directive.
 *
 * @param {string} url
 */
Robots.prototype.setPreferredHost = function (url) {
  this._preferredHost = url;
};
|
||||
|
||||
/**
 * Looks up the rule that applies to a URL for a user agent.
 *
 * @param {string} url
 * @param {string?} ua
 * @param {boolean} explicit - When true, do not fall back to the `*`
 *     wildcard agent group
 * @return {Object?} The matching rule; undefined when the URL does not
 *     belong to this robots.txt file
 * @private
 */
Robots.prototype._getRule = function (url, ua, explicit) {
  const parsedUrl = parseUrl(url) || {};
  const userAgent = formatUserAgent(ua || '*');

  // The base URL must match, otherwise this robots.txt is not valid for it
  const sameOrigin =
    parsedUrl.protocol === this._url.protocol &&
    parsedUrl.hostname === this._url.hostname &&
    parsedUrl.port === this._url.port;

  if (!sameOrigin) {
    return;
  }

  let rules = this._rules[userAgent];
  if (!explicit) {
    rules = rules || this._rules['*'];
  }

  const path = urlEncodeToUpper(parsedUrl.pathname + parsedUrl.search);

  return findRule(path, rules || []);
};
|
||||
|
||||
/**
 * Returns true if allowed, false if not allowed.
 *
 * Will return undefined if the URL is not valid for
 * this robots.txt file.
 *
 * @param {string} url
 * @param {string?} ua
 * @return {boolean?}
 */
Robots.prototype.isAllowed = function (url, ua) {
  const rule = this._getRule(url, ua, false);

  // URL does not belong to this robots.txt file
  if (rule === undefined) {
    return;
  }

  // No matching rule (null) means the URL is allowed by default
  return !rule || rule.allow;
};
|
||||
|
||||
/**
 * Returns the line number of the matching directive for the specified
 * URL and user-agent if any.
 *
 * Line numbers start at 1 (1-based indexing).
 *
 * Returns -1 if there is no matching directive. If a rule was added
 * manually without a lineNumber this returns undefined for that rule.
 *
 * @param {string} url
 * @param {string?} ua
 * @return {number?}
 */
Robots.prototype.getMatchingLineNumber = function (url, ua) {
  const rule = this._getRule(url, ua, false);

  if (rule) {
    return rule.lineNumber;
  }

  return -1;
};
|
||||
|
||||
/**
 * Returns the opposite of isAllowed().
 *
 * Note: an invalid URL (isAllowed() === undefined) also counts as
 * disallowed here, since !undefined is true.
 *
 * @param {string} url
 * @param {string?} ua
 * @return {boolean}
 */
Robots.prototype.isDisallowed = function (url, ua) {
  const allowed = this.isAllowed(url, ua);
  return allowed !== true;
};
|
||||
|
||||
/**
 * Returns true if the URL is explicitly disallowed for the specified
 * user agent (the `*` wildcard agent group is ignored).
 *
 * Returns undefined if the URL is not valid for this robots.txt file.
 *
 * @param {string} url
 * @param {string} ua
 * @return {boolean?}
 */
Robots.prototype.isExplicitlyDisallowed = function (url, ua) {
  const rule = this._getRule(url, ua, true);

  // URL does not belong to this robots.txt file
  if (rule === undefined) {
    return;
  }

  // Only a matching disallow rule counts as explicit disallowal
  return Boolean(rule) && !rule.allow;
};
|
||||
|
||||
/**
 * Gets the crawl delay if there is one.
 *
 * Returns undefined when no crawl delay has been set.
 *
 * @param {string} ua
 * @return {number?}
 */
Robots.prototype.getCrawlDelay = function (ua) {
  const userAgent = formatUserAgent(ua || '*');
  // Fall back to the wildcard group, then to an empty object
  const group = this._rules[userAgent] || this._rules['*'] || {};

  return group.crawlDelay;
};
|
||||
|
||||
/**
 * Returns the preferred host declared by a `host` directive, if any.
 *
 * @return {string?}
 */
Robots.prototype.getPreferredHost = function () {
  return this._preferredHost;
};
|
||||
|
||||
/**
 * Returns an array of sitemap URLs if there are any.
 *
 * @return {Array.<string>} A shallow copy of the collected sitemap URLs
 */
Robots.prototype.getSitemaps = function () {
  return this._sitemaps.slice();
};
|
||||
|
||||
return Robots;
|
||||
})();
|
||||
|
||||
// CommonJS export so the parser can be require()d (e.g. by the unit
// tests in tests/robots.js); in Rhino `module` is undefined and the
// global `Robots` binding from the IIFE above is used instead.
if (typeof module !== 'undefined' && module.exports) {
  module.exports = Robots;
}
|
||||
|
|
@ -140,10 +140,18 @@ HopObject.prototype.onRequest = function() {
|
|||
}
|
||||
}
|
||||
|
||||
// Set up layout handler and skin path
|
||||
HopObject.confirmConstructor(Layout);
|
||||
res.handlers.layout = res.handlers.site.layout || new Layout;
|
||||
res.skinpath = res.handlers.layout.getSkinPath();
|
||||
|
||||
if (res.handlers.site.enforceRobotsTxt()) {
|
||||
res.status = 403
|
||||
res.data.error = gettext('The <a href="{0}">robots.txt</a> file disallows access to this page.', res.handlers.site.href('robots.txt'));
|
||||
root.error_action();
|
||||
res.stop();
|
||||
}
|
||||
|
||||
if (!this.getPermission(req.action)) {
|
||||
if (!session.user) {
|
||||
User.setLocation(root.href() + req.path);
|
||||
|
|
|
|||
|
|
@ -143,6 +143,22 @@
|
|||
</div>
|
||||
</div>
|
||||
|
||||
<div class='uk-form-row'>
|
||||
<label class='uk-form-label' for='trollFilter'>
|
||||
<% gettext 'Robot rules' %>
|
||||
</label>
|
||||
|
||||
<div class='uk-form-controls'>
|
||||
<label>
|
||||
<% site.checkbox robotsTxtMode %>
|
||||
<% gettext enforced %>
|
||||
</label>
|
||||
<p class="uk-form-help-block">
|
||||
<% gettext 'Edit the rules in the <a href="{0}Site/robots/edit">robots.txt</a> skin.' <% site.layout.skins.href %> %>
|
||||
</p>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class='uk-form-row'>
|
||||
<label class='uk-form-label' for='trollFilter'>
|
||||
<% gettext 'Troll Filter' %>
|
||||
|
|
|
|||
|
|
@ -39,6 +39,7 @@ this.handleMetadata('notificationMode');
|
|||
this.handleMetadata('notified');
|
||||
this.handleMetadata('pageSize');
|
||||
this.handleMetadata('pageMode');
|
||||
this.handleMetadata('robotsTxtMode');
|
||||
this.handleMetadata('spamfilter');
|
||||
this.handleMetadata('tagline');
|
||||
this.handleMetadata('timeZone');
|
||||
|
|
@ -46,7 +47,7 @@ this.handleMetadata('title');
|
|||
this.handleMetadata('trollFilter');
|
||||
|
||||
/**
|
||||
* Ffunction
|
||||
* @function
|
||||
* @returns {String[]}
|
||||
* @see defineConstants
|
||||
*/
|
||||
|
|
@ -94,6 +95,13 @@ Site.getNotificationModes = defineConstants(Site, markgettext('Nobody'),
|
|||
*/
|
||||
Site.getCallbackModes = defineConstants(Site, markgettext('disabled'),
|
||||
markgettext('enabled'));
|
||||
/**
|
||||
* @function
|
||||
* @returns {String[]}
|
||||
* @see defineConstants
|
||||
*/
|
||||
Site.getRobotsTxtModes = defineConstants(Site, markgettext('suggest'),
|
||||
markgettext('enforce'));
|
||||
|
||||
/**
|
||||
* @param {String} name A unique identifier also used in the URL of a site
|
||||
|
|
@ -132,6 +140,7 @@ Site.add = function(data, user) {
|
|||
configured: now,
|
||||
created: now,
|
||||
creator: user,
|
||||
robotsTxtMode: Site.SUGGEST,
|
||||
modified: now,
|
||||
modifier: user,
|
||||
status: user.status === User.PRIVILEGED ? Site.TRUSTED : user.status,
|
||||
|
|
@ -367,6 +376,8 @@ Site.prototype.getFormOptions = function(name) {
|
|||
switch (name) {
|
||||
case 'archiveMode':
|
||||
return Site.getArchiveModes();
|
||||
case 'callbackMode':
|
||||
return Site.getCallbackModes();
|
||||
case 'commentMode':
|
||||
return Site.getCommentModes();
|
||||
case 'locale':
|
||||
|
|
@ -379,12 +390,12 @@ Site.prototype.getFormOptions = function(name) {
|
|||
return Site.getNotificationModes();
|
||||
case 'pageMode':
|
||||
return Site.getPageModes();
|
||||
case 'robotsTxtMode':
|
||||
return Site.getRobotsTxtModes();
|
||||
case 'status':
|
||||
return Site.getStatus();
|
||||
case 'timeZone':
|
||||
return getTimeZones(this.getLocale());
|
||||
case 'callbackMode':
|
||||
return Site.getCallbackModes();
|
||||
default:
|
||||
return HopObject.prototype.getFormOptions.apply(this, arguments);
|
||||
}
|
||||
|
|
@ -441,8 +452,9 @@ Site.prototype.update = function(data) {
|
|||
archiveMode: data.archiveMode || Site.CLOSED,
|
||||
callbackMode: data.callbackMode || Site.DISABLED,
|
||||
callbackUrl: data.callbackUrl || this.callbackUrl || String.EMPTY,
|
||||
imageDimensionLimits: [data.maxImageWidth, data.maxImageHeight],
|
||||
commentMode: data.commentMode || Site.DISABLED,
|
||||
robotsTxtMode: data.robotsTxtMode || Site.RELAXED,
|
||||
imageDimensionLimits: [data.maxImageWidth, data.maxImageHeight],
|
||||
locale: data.locale || root.getLocale().toString(),
|
||||
mode: data.mode || Site.CLOSED,
|
||||
notificationMode: data.notificationMode || Site.NOBODY,
|
||||
|
|
@ -477,7 +489,8 @@ Site.prototype.main_css_action = function() {
|
|||
res.push();
|
||||
this.renderSkin('$Site#stylesheet');
|
||||
this.renderSkin('Site#stylesheet');
|
||||
var css = res.pop();
|
||||
var css = res.pop()
|
||||
.replace(/<(\/?style|!).*/g, ''); // TODO: Actually, a compatibility fix (earlier CSS skins contained the <style> element)
|
||||
|
||||
try {
|
||||
lessParser.parse(css, function(error, less) {
|
||||
|
|
@ -1124,3 +1137,28 @@ Site.prototype.callback = function(ref) {
|
|||
}
|
||||
return;
|
||||
}
|
||||
|
||||
/**
 * Checks whether the current request should be blocked according to the
 * site's robots.txt rules.
 *
 * Only takes effect when the site's robotsTxtMode is Site.ENFORCE.
 *
 * @returns {Boolean} True when the requested URL is disallowed by the
 *     robots.txt rules and is not one of the always-accessible URLs
 */
Site.prototype.enforceRobotsTxt = function() {
  if (this.robotsTxtMode !== Site.ENFORCE) {
    return false;
  }

  // Override some URLs to prevent a site from becoming inaccessible even for the owner
  const overrides = [
    this.href('edit'),
    this.href('main.css'),
    this.href('main.js'),
    this.href('robots.txt'),
    this.layout.href(),
    this.members.href()
  ];

  const robotsTxt = root.renderSkinAsString('Site#robots');
  const robots = new Robots(this.href('robots.txt'), robotsTxt);

  const href = path.href(req.action);
  // FIX: only strip the leading character from relative hrefs; previously
  // `slice(1)` was also applied to absolute URLs, cutting off their first
  // character and producing an invalid URL.
  // NOTE(review): assumes this.href() ends with '/' and a relative href
  // starts with '/' (hence the slice) — TODO confirm against callers
  const fullUrl = href.includes('://') ? href : this.href() + href.slice(1);

  return !overrides.some(url => fullUrl.includes(url))
    && !robots.isAllowed(fullUrl, req.getHeader('user-agent'));
}
|
||||
|
|
|
|||
File diff suppressed because it is too large
Load diff
344
i18n/de.po
344
i18n/de.po
File diff suppressed because it is too large
Load diff
|
|
@ -125,6 +125,7 @@ global.messages['de'] = {
|
|||
"Edit Poll": "Umfrage bearbeiten",
|
||||
"Edit Story": "Beitrag bearbeiten",
|
||||
"Edit the filter in the site settings.": "Der Filter kann in den Einstellungen bearbeitet werden.",
|
||||
"Edit the rules in the <a href=\"{0}Site/robots/edit\">robots.txt</a> skin.": "Bearbeiten Sie die Regeln im <a href=\"{0}Site/robots/edit\">robots.txt</a>-Skin.",
|
||||
"Edit {0}.{1}": "{0}.{1} bearbeiten",
|
||||
"Enabled": "Aktiviert",
|
||||
"Enter one filter {0}pattern{1} per line to be applied on every URL in the referrer and backlink lists.": "Geben Sie ein {0}Filter-Schema{1} pro Zeile ein, das für jede Adresse in den Rückverweis-Listen angewendet werden soll.",
|
||||
|
|
@ -290,6 +291,7 @@ global.messages['de'] = {
|
|||
"Resource type (e.g. Story or Comment)": "Art der Ressource (z.B. Beitrag oder Kommentar)",
|
||||
"Restricted": "Eingeschränkt",
|
||||
"Results": "Ergebnis",
|
||||
"Robot rules": "Regeln für Robots",
|
||||
"Role": "Rolle",
|
||||
"Running": "Laufende",
|
||||
"Running Polls": "Laufende Umfragen",
|
||||
|
|
@ -359,6 +361,7 @@ global.messages['de'] = {
|
|||
"Terms and Conditions": "Nutzungsbedingungen",
|
||||
"Text": "Text",
|
||||
"Thanks, your vote was registered. You can change your mind until the poll is closed.": "Danke, Ihre Stimme wurde gezählt. Bis die Umfrage beendet ist, können Sie Ihre Meinung jederzeit ändern.",
|
||||
"The <a href=\"{0}\">robots.txt</a> file disallows access to this page.": "Die <a href=\"{0}\">robots.txt</a>-Datei verbietet den Zugriff auf diese Seite.",
|
||||
"The Management": "Die Direktion",
|
||||
"The URL endpoint for each of these APIs is located at": "Die Internet-Adresse für jede dieser Schnittstellen lautet",
|
||||
"The account data will be available for download from here within the next days.": "Die Kontodaten stehen demnächst hier zum Download bereit.",
|
||||
|
|
@ -529,6 +532,8 @@ global.messages['de'] = {
|
|||
"e-mail": "E-Mail",
|
||||
"e.g. {0}": "z.B. {0}",
|
||||
"enabled": "aktiviert",
|
||||
"enforce": "erzwingen",
|
||||
"enforced": "erzwingen",
|
||||
"export": "Exportieren",
|
||||
"featured": "sichtbar",
|
||||
"file": "Datei",
|
||||
|
|
@ -578,6 +583,7 @@ global.messages['de'] = {
|
|||
"soon": "in Kürze",
|
||||
"stories": "Beiträge",
|
||||
"story": "Beitrag",
|
||||
"suggest": "vorschlagen",
|
||||
"tag": "Stichwort",
|
||||
"tags": "Stichworte",
|
||||
"tomorrow": "morgen",
|
||||
|
|
|
|||
912
tests/robots.js
Normal file
912
tests/robots.js
Normal file
|
|
@ -0,0 +1,912 @@
|
|||
// Unit tests of the robots parser
|
||||
// Source: <https://github.com/samclarke/robots-parser/blob/master/test/Robots.js>
|
||||
// Copyright (c) 2014 Sam Clarke
|
||||
// MIT License (MIT)
|
||||
|
||||
// Run with `npx nyc --reporter=text-summary --reporter=html --reporter=lcovonly mocha tests/robots.js`
|
||||
|
||||
// Set up the test environment with Antville’s version of the robots parser
|
||||
const Robots = require('../code/Global/Robots.js');
|
||||
const robotsParser = (url, contents) => new Robots(url, contents);
|
||||
|
||||
const { expect } = require('chai');
|
||||
|
||||
/**
 * Asserts that every URL in `allowed` is allowed and every URL in
 * `disallowed` is disallowed by the given robots.txt contents.
 *
 * @param {string} url - URL of the robots.txt file
 * @param {string} contents - Raw robots.txt contents
 * @param {string[]} allowed - URLs expected to be crawlable
 * @param {string[]} disallowed - URLs expected to be blocked
 */
function testRobots(url, contents, allowed, disallowed) {
  const robots = robotsParser(url, contents);

  for (const allowedUrl of allowed) {
    expect(robots.isAllowed(allowedUrl)).to.equal(true);
  }

  for (const disallowedUrl of disallowed) {
    expect(robots.isDisallowed(disallowedUrl)).to.equal(true);
  }
}
|
||||
|
||||
describe('Robots', function () {
|
||||
it('should parse the disallow directive', function () {
|
||||
var contents = [
|
||||
'User-agent: *',
|
||||
'Disallow: /fish/',
|
||||
'Disallow: /test.html'
|
||||
].join('\n');
|
||||
|
||||
var allowed = [
|
||||
'http://www.example.com/fish',
|
||||
'http://www.example.com/Test.html'
|
||||
];
|
||||
|
||||
var disallowed = [
|
||||
'http://www.example.com/fish/index.php',
|
||||
'http://www.example.com/fish/',
|
||||
'http://www.example.com/test.html'
|
||||
];
|
||||
|
||||
testRobots('http://www.example.com/robots.txt', contents, allowed, disallowed);
|
||||
});
|
||||
|
||||
it('should parse the allow directive', function () {
|
||||
var contents = [
|
||||
'User-agent: *',
|
||||
'Disallow: /fish/',
|
||||
'Disallow: /test.html',
|
||||
'Allow: /fish/test.html',
|
||||
'Allow: /test.html'
|
||||
].join('\n');
|
||||
|
||||
var allowed = [
|
||||
'http://www.example.com/fish',
|
||||
'http://www.example.com/fish/test.html',
|
||||
'http://www.example.com/Test.html',
|
||||
'http://www.example.com/test.html'
|
||||
];
|
||||
|
||||
var disallowed = [
|
||||
'http://www.example.com/fish/index.php',
|
||||
'http://www.example.com/fish/',
|
||||
];
|
||||
|
||||
testRobots('http://www.example.com/robots.txt', contents, allowed, disallowed);
|
||||
});
|
||||
|
||||
it('should parse patterns', function () {
|
||||
var contents = [
|
||||
'User-agent: *',
|
||||
'Disallow: /fish*.php',
|
||||
'Disallow: /*.dext$',
|
||||
'Disallow: /dir*'
|
||||
].join('\n');
|
||||
|
||||
var allowed = [
|
||||
'http://www.example.com/Fish.PHP',
|
||||
'http://www.example.com/Fish.dext1',
|
||||
'http://www.example.com/folder/dir.html',
|
||||
'http://www.example.com/folder/dir/test.html'
|
||||
];
|
||||
|
||||
var disallowed = [
|
||||
'http://www.example.com/fish.php',
|
||||
'http://www.example.com/fishheads/catfish.php?parameters',
|
||||
'http://www.example.com/AnYthInG.dext',
|
||||
'http://www.example.com/Fish.dext.dext',
|
||||
'http://www.example.com/dir/test.html',
|
||||
'http://www.example.com/directory.html'
|
||||
];
|
||||
|
||||
testRobots('http://www.example.com/robots.txt', contents, allowed, disallowed);
|
||||
});
|
||||
|
||||
it('should have the correct order precedence for allow and disallow', function () {
|
||||
var contents = [
|
||||
'User-agent: *',
|
||||
'Disallow: /fish*.php',
|
||||
'Allow: /fish/index.php',
|
||||
'Disallow: /test',
|
||||
'Allow: /test/',
|
||||
'Disallow: /aa/',
|
||||
'Allow: /aa/',
|
||||
'Allow: /bb/',
|
||||
'Disallow: /bb/',
|
||||
].join('\n');
|
||||
|
||||
var allowed = [
|
||||
'http://www.example.com/test/index.html',
|
||||
'http://www.example.com/fish/index.php',
|
||||
'http://www.example.com/test/',
|
||||
'http://www.example.com/aa/',
|
||||
'http://www.example.com/bb/',
|
||||
'http://www.example.com/x/'
|
||||
];
|
||||
|
||||
var disallowed = [
|
||||
'http://www.example.com/fish.php',
|
||||
'http://www.example.com/fishheads/catfish.php?parameters',
|
||||
'http://www.example.com/test'
|
||||
];
|
||||
|
||||
testRobots('http://www.example.com/robots.txt', contents, allowed, disallowed);
|
||||
});
|
||||
|
||||
it('should have the correct order precedence for wildcards', function () {
|
||||
var contents = [
|
||||
'User-agent: *',
|
||||
'Disallow: /*/',
|
||||
'Allow: /x/',
|
||||
].join('\n');
|
||||
|
||||
var allowed = [
|
||||
'http://www.example.com/x/',
|
||||
'http://www.example.com/fish.php',
|
||||
'http://www.example.com/test'
|
||||
];
|
||||
|
||||
var disallowed = [
|
||||
'http://www.example.com/a/',
|
||||
'http://www.example.com/xx/',
|
||||
'http://www.example.com/test/index.html'
|
||||
];
|
||||
|
||||
testRobots('http://www.example.com/robots.txt', contents, allowed, disallowed);
|
||||
});
|
||||
|
||||
it('should parse lines delimitated by \\r', function () {
|
||||
var contents = [
|
||||
'User-agent: *',
|
||||
'Disallow: /fish/',
|
||||
'Disallow: /test.html'
|
||||
].join('\r');
|
||||
|
||||
var allowed = [
|
||||
'http://www.example.com/fish',
|
||||
'http://www.example.com/Test.html'
|
||||
];
|
||||
|
||||
var disallowed = [
|
||||
'http://www.example.com/fish/index.php',
|
||||
'http://www.example.com/fish/',
|
||||
'http://www.example.com/test.html'
|
||||
];
|
||||
|
||||
testRobots('http://www.example.com/robots.txt', contents, allowed, disallowed);
|
||||
});
|
||||
|
||||
it('should parse lines delimitated by \\r\\n', function () {
|
||||
var contents = [
|
||||
'User-agent: *',
|
||||
'Disallow: /fish/',
|
||||
'Disallow: /test.html'
|
||||
].join('\r\n');
|
||||
|
||||
var allowed = [
|
||||
'http://www.example.com/fish',
|
||||
'http://www.example.com/Test.html'
|
||||
];
|
||||
|
||||
var disallowed = [
|
||||
'http://www.example.com/fish/index.php',
|
||||
'http://www.example.com/fish/',
|
||||
'http://www.example.com/test.html'
|
||||
];
|
||||
|
||||
testRobots('http://www.example.com/robots.txt', contents, allowed, disallowed);
|
||||
});
|
||||
|
||||
|
||||
it('should parse lines delimitated by mixed line endings', function () {
|
||||
var contents = [
|
||||
'User-agent: *\r',
|
||||
'Disallow: /fish/\r\n',
|
||||
'Disallow: /test.html\n\n'
|
||||
].join('');
|
||||
|
||||
var allowed = [
|
||||
'http://www.example.com/fish',
|
||||
'http://www.example.com/Test.html'
|
||||
];
|
||||
|
||||
var disallowed = [
|
||||
'http://www.example.com/fish/index.php',
|
||||
'http://www.example.com/fish/',
|
||||
'http://www.example.com/test.html'
|
||||
];
|
||||
|
||||
testRobots('http://www.example.com/robots.txt', contents, allowed, disallowed);
|
||||
});
|
||||
|
||||
it('should ignore rules that are not in a group', function () {
|
||||
var contents = [
|
||||
'Disallow: /secret.html',
|
||||
'Disallow: /test',
|
||||
].join('\n');
|
||||
|
||||
var allowed = [
|
||||
'http://www.example.com/secret.html',
|
||||
'http://www.example.com/test/index.html',
|
||||
'http://www.example.com/test/'
|
||||
];
|
||||
|
||||
testRobots('http://www.example.com/robots.txt', contents, allowed, []);
|
||||
});
|
||||
|
||||
|
||||
it('should ignore comments', function () {
  // Comment-only lines and trailing "# ..." fragments must not affect parsing.
  var robotsTxt = [
    '#',
    '# This is a comment',
    '#',
    'User-agent: *',
    '# This is a comment',
    'Disallow: /fish/ # ignore',
    '# Disallow: fish',
    'Disallow: /test.html'
  ].join('\n');

  var reachable = [
    'http://www.example.com/fish',
    'http://www.example.com/Test.html'
  ];

  var blocked = [
    'http://www.example.com/fish/index.php',
    'http://www.example.com/fish/',
    'http://www.example.com/test.html'
  ];

  testRobots('http://www.example.com/robots.txt', robotsTxt, reachable, blocked);
});
|
||||
|
||||
it('should ignore invalid lines', function () {
  // Malformed lines and unknown directives are skipped; valid rules
  // around them still apply.
  var robotsTxt = [
    'invalid line',
    'User-agent: *',
    'Disallow: /fish/',
    ':::::another invalid line:::::',
    'Disallow: /test.html',
    'Unknown: tule'
  ].join('\n');

  var reachable = [
    'http://www.example.com/fish',
    'http://www.example.com/Test.html'
  ];

  var blocked = [
    'http://www.example.com/fish/index.php',
    'http://www.example.com/fish/',
    'http://www.example.com/test.html'
  ];

  testRobots('http://www.example.com/robots.txt', robotsTxt, reachable, blocked);
});
|
||||
|
||||
it('should ignore empty user-agent lines', function () {
  // A "User-agent:" line with no value opens no group, so the rules
  // that follow it apply to nobody and everything stays allowed.
  var robotsTxt = 'User-agent:\nDisallow: /fish/\nDisallow: /test.html';

  var reachable = [
    'http://www.example.com/fish',
    'http://www.example.com/Test.html',
    'http://www.example.com/fish/index.php',
    'http://www.example.com/fish/',
    'http://www.example.com/test.html'
  ];

  testRobots('http://www.example.com/robots.txt', robotsTxt, reachable, []);
});
|
||||
|
||||
it('should support groups with multiple user agents (case insensitive)', function () {
  // One rule group may list several User-agent lines; the rules apply
  // to every agent named in the group.
  var robotsTxt = 'User-agent: agenta\nUser-agent: agentb\nDisallow: /fish';

  var parser = robotsParser('http://www.example.com/robots.txt', robotsTxt);

  expect(parser.isAllowed("http://www.example.com/fish", "agenta")).to.equal(false);
});
|
||||
|
||||
it('should return undefined for invalid urls', function () {
  // URLs that cannot be parsed, or whose origin differs from the
  // robots.txt origin, cannot be answered: isAllowed yields undefined.
  var robotsTxt = [
    'User-agent: *',
    'Disallow: /secret.html',
    'Disallow: /test'
  ].join('\n');

  var unanswerable = [
    'http://example.com/secret.html',
    'http://ex ample.com/secret.html',
    'http://www.example.net/test/index.html',
    'http://www.examsple.com/test/',
    'example.com/test/',
    ':::::;;`\\|/.example.com/test/'
  ];

  var parser = robotsParser('http://www.example.com/robots.txt', robotsTxt);

  unanswerable.forEach(function (candidate) {
    expect(parser.isAllowed(candidate)).to.equal(undefined);
  });
});
|
||||
|
||||
it('should handle Unicode, urlencoded and punycode URLs', function () {
  // The same host written as raw Unicode, percent-encoded, or punycode
  // must be treated as identical, for both the robots.txt URL and the
  // URLs being checked.
  var robotsTxt = [
    'User-agent: *',
    'Disallow: /secret.html',
    'Disallow: /test'
  ].join('\n');

  var reachable = [
    'http://www.münich.com/index.html',
    'http://www.xn--mnich-kva.com/index.html',
    'http://www.m%C3%BCnich.com/index.html'
  ];

  var blocked = [
    'http://www.münich.com/secret.html',
    'http://www.xn--mnich-kva.com/secret.html',
    'http://www.m%C3%BCnich.com/secret.html'
  ];

  testRobots('http://www.münich.com/robots.txt', robotsTxt, reachable, blocked);
  testRobots('http://www.xn--mnich-kva.com/robots.txt', robotsTxt, reachable, blocked);
  testRobots('http://www.m%C3%BCnich.com/robots.txt', robotsTxt, reachable, blocked);
});
|
||||
|
||||
it('should handle Unicode and urlencoded paths', function () {
  // Paths must match whether the pattern or the URL is written as raw
  // Unicode or percent-encoded (including invalid escapes like %a%a,
  // which are compared literally).
  var robotsTxt = [
    'User-agent: *',
    'Disallow: /%CF%80',
    'Disallow: /%e2%9d%83',
    'Disallow: /%a%a',
    'Disallow: /💩',
    'Disallow: /✼*t$',
    'Disallow: /%E2%9C%A4*t$',
    'Disallow: /✿%a',
    'Disallow: /http%3A%2F%2Fexample.org'
  ].join('\n');

  var reachable = [
    'http://www.example.com/✼testing',
    'http://www.example.com/%E2%9C%BCtesting',
    'http://www.example.com/✤testing',
    'http://www.example.com/%E2%9C%A4testing',
    'http://www.example.com/http://example.org',
    'http://www.example.com/http:%2F%2Fexample.org'
  ];

  var blocked = [
    'http://www.example.com/%CF%80',
    'http://www.example.com/%CF%80/index.html',
    'http://www.example.com/π',
    'http://www.example.com/π/index.html',
    'http://www.example.com/%e2%9d%83',
    'http://www.example.com/%E2%9D%83/index.html',
    'http://www.example.com/❃',
    'http://www.example.com/❃/index.html',
    'http://www.example.com/%F0%9F%92%A9',
    'http://www.example.com/%F0%9F%92%A9/index.html',
    'http://www.example.com/💩',
    'http://www.example.com/💩/index.html',
    'http://www.example.com/%a%a',
    'http://www.example.com/%a%a/index.html',
    'http://www.example.com/✼test',
    'http://www.example.com/%E2%9C%BCtest',
    'http://www.example.com/✤test',
    'http://www.example.com/%E2%9C%A4testt',
    'http://www.example.com/✿%a',
    'http://www.example.com/%E2%9C%BF%atest',
    'http://www.example.com/http%3A%2F%2Fexample.org'
  ];

  testRobots('http://www.example.com/robots.txt', robotsTxt, reachable, blocked);
});
|
||||
|
||||
it('should handle lone high / low surrogates', function () {
  // Unpaired surrogate code units are invalid, so such patterns can
  // never match and nothing ends up disallowed.
  var robotsTxt = [
    'User-agent: *',
    'Disallow: /\uD800',
    'Disallow: /\uDC00'
  ].join('\n');

  // These are invalid so can't be disallowed
  var reachable = [
    'http://www.example.com/\uDC00',
    'http://www.example.com/\uD800'
  ];

  testRobots('http://www.example.com/robots.txt', robotsTxt, reachable, []);
});
|
||||
|
||||
it('should ignore host case', function () {
  // Hostname comparison is case-insensitive on both sides.
  var robotsTxt = [
    'User-agent: *',
    'Disallow: /secret.html',
    'Disallow: /test'
  ].join('\n');

  var reachable = [
    'http://www.example.com/index.html',
    'http://www.ExAmPlE.com/index.html',
    'http://www.EXAMPLE.com/index.html'
  ];

  var blocked = [
    'http://www.example.com/secret.html',
    'http://www.ExAmPlE.com/secret.html',
    'http://www.EXAMPLE.com/secret.html'
  ];

  testRobots('http://www.eXample.com/robots.txt', robotsTxt, reachable, blocked);
});
|
||||
|
||||
it('should handle relative paths', function () {
  // When the robots.txt URL is itself relative, relative URLs can be
  // checked against it directly.
  var robotsTxt = 'User-agent: *\nDisallow: /fish\nAllow: /fish/test';

  var parser = robotsParser('/robots.txt', robotsTxt);

  expect(parser.isAllowed('/fish/test')).to.equal(true);
  expect(parser.isAllowed('/fish')).to.equal(false);
});
|
||||
|
||||
it('should not allow relative paths if domain specified', function () {
  // An absolute robots.txt URL cannot be matched against relative
  // candidate URLs, so the answer is undefined.
  var robotsTxt = 'User-agent: *\nDisallow: /fish\nAllow: /fish/test';

  var parser = robotsParser('http://www.example.com/robots.txt', robotsTxt);

  expect(parser.isAllowed('/fish/test')).to.equal(undefined);
  expect(parser.isAllowed('/fish')).to.equal(undefined);
});
|
||||
|
||||
it('should not treat invalid robots.txt URLs as relative', function () {
  // A malformed absolute robots.txt URL must not be silently reinterpreted
  // as a relative one; lookups stay unanswerable.
  var robotsTxt = 'User-agent: *\nDisallow: /fish\nAllow: /fish/test';

  var parser = robotsParser('https://ex ample.com/robots.txt', robotsTxt);

  expect(parser.isAllowed('/fish/test')).to.equal(undefined);
  expect(parser.isAllowed('/fish')).to.equal(undefined);
});
|
||||
|
||||
it('should not allow URLs if domain specified and robots.txt is relative', function () {
  // Fix: test description misspelled "URLs" as "URls".
  // A relative robots.txt URL has no origin, so absolute candidate URLs
  // cannot be matched against it and the answer is undefined.
  var contents = [
    'User-agent: *',
    'Disallow: /fish',
    'Allow: /fish/test',
  ].join('\n');

  var robots = robotsParser('/robots.txt', contents);

  expect(robots.isAllowed('http://www.example.com/fish/test')).to.equal(undefined);
  expect(robots.isAllowed('http://www.example.com/fish')).to.equal(undefined);
});
|
||||
|
||||
it('should allow all if empty robots.txt', function () {
  // An empty file contains no rules, so every URL on the origin is allowed.
  var reachable = [
    'http://www.example.com/secret.html',
    'http://www.example.com/test/index.html',
    'http://www.example.com/test/'
  ];

  var parser = robotsParser('http://www.example.com/robots.txt', '');

  reachable.forEach(function (candidate) {
    expect(parser.isAllowed(candidate)).to.equal(true);
  });
});
|
||||
|
||||
it('should treat null as allowing all', function () {
  // Passing null contents behaves like an empty robots.txt: everything
  // is allowed, with or without a user agent.
  var parser = robotsParser('http://www.example.com/robots.txt', null);

  expect(parser.isAllowed("http://www.example.com/", "userAgent")).to.equal(true);
  expect(parser.isAllowed("http://www.example.com/")).to.equal(true);
});
|
||||
|
||||
it('should handle invalid robots.txt urls', function () {
  // Even with an unusable robots.txt URL, host and sitemap directives
  // are still parsed; only per-URL queries become unanswerable.
  var robotsTxt = [
    'user-agent: *',
    'disallow: /',

    'host: www.example.com',
    'sitemap: /sitemap.xml'
  ].join('\n');

  var badRobotsUrls = [
    undefined,
    null,
    'null',
    ':/wom/test/'
  ];

  badRobotsUrls.forEach(function (robotsUrl) {
    var parser = robotsParser(robotsUrl, robotsTxt);

    expect(parser.isAllowed('http://www.example.com/index.html')).to.equal(undefined);
    expect(parser.getPreferredHost()).to.equal('www.example.com');
    expect(parser.getSitemaps()).to.eql(['/sitemap.xml']);
  });
});
|
||||
|
||||
it('should parse the crawl-delay directive', function () {
  // Crawl-delay is per group; agents in a shared group get the same
  // value, agents without one get undefined.
  var robotsTxt = [
    'user-agent: a',
    'crawl-delay: 1',

    'user-agent: b',
    'disallow: /d',

    'user-agent: c',
    'user-agent: d',
    'crawl-delay: 10'
  ].join('\n');

  var parser = robotsParser('http://www.example.com/robots.txt', robotsTxt);

  expect(parser.getCrawlDelay('a')).to.equal(1);
  expect(parser.getCrawlDelay('b')).to.equal(undefined);
  expect(parser.getCrawlDelay('c')).to.equal(10);
  expect(parser.getCrawlDelay('d')).to.equal(10);
  expect(parser.getCrawlDelay()).to.equal(undefined);
});
|
||||
|
||||
it('should ignore invalid crawl-delay directives', function () {
  // Values that are not plain numbers are discarded rather than guessed at.
  var robotsTxt = [
    'user-agent: a',
    'crawl-delay: 1.2.1',

    'user-agent: b',
    'crawl-delay: 1.a0',

    'user-agent: c',
    'user-agent: d',
    'crawl-delay: 10a'
  ].join('\n');

  var parser = robotsParser('http://www.example.com/robots.txt', robotsTxt);

  expect(parser.getCrawlDelay('a')).to.equal(undefined);
  expect(parser.getCrawlDelay('b')).to.equal(undefined);
  expect(parser.getCrawlDelay('c')).to.equal(undefined);
  expect(parser.getCrawlDelay('d')).to.equal(undefined);
});
|
||||
|
||||
it('should parse the sitemap directive', function () {
  // Sitemap lines are collected from anywhere in the file (they are not
  // group-scoped) and surrounding whitespace is trimmed.
  var robotsTxt = [
    'user-agent: a',
    'crawl-delay: 1',
    'sitemap: http://example.com/test.xml',

    'user-agent: b',
    'disallow: /d',

    'sitemap: /sitemap.xml',
    'sitemap: http://example.com/test/sitemap.xml '
  ].join('\n');

  var parser = robotsParser('http://www.example.com/robots.txt', robotsTxt);

  expect(parser.getSitemaps()).to.eql([
    'http://example.com/test.xml',
    '/sitemap.xml',
    'http://example.com/test/sitemap.xml'
  ]);
});
|
||||
|
||||
it('should parse the host directive', function () {
  // Host is file-global; the value seen last wins (see the dedicated
  // last-host test below for the multi-value case).
  var robotsTxt = [
    'user-agent: a',
    'crawl-delay: 1',
    'host: www.example.net',

    'user-agent: b',
    'disallow: /d',

    'host: example.com'
  ].join('\n');

  var parser = robotsParser('http://www.example.com/robots.txt', robotsTxt);

  expect(parser.getPreferredHost()).to.equal('example.com');
});
|
||||
|
||||
it('should parse empty and invalid directives', function () {
  // Smoke test: none of these malformed directives may throw during parsing.
  var robotsTxt = [
    'user-agent:',
    'user-agent:::: a::',
    'crawl-delay:',
    'crawl-delay:::: 0:',
    'host:',
    'host:: example.com',
    'sitemap:',
    'sitemap:: site:map.xml',
    'disallow:',
    'disallow::: /:',
    'allow:',
    'allow::: /:'
  ].join('\n');

  robotsParser('http://www.example.com/robots.txt', robotsTxt);
});
|
||||
|
||||
it('should treat only the last host directive as valid', function () {
  // With several host directives, only the final one is reported.
  var robotsTxt = [
    'user-agent: a',
    'crawl-delay: 1',
    'host: www.example.net',

    'user-agent: b',
    'disallow: /d',

    'host: example.net',
    'host: example.com'
  ].join('\n');

  var parser = robotsParser('http://www.example.com/robots.txt', robotsTxt);

  expect(parser.getPreferredHost()).to.equal('example.com');
});
|
||||
|
||||
it('should return null when there is no host directive', function () {
  var robotsTxt = [
    'user-agent: a',
    'crawl-delay: 1',

    'user-agent: b',
    'disallow: /d'
  ].join('\n');

  var parser = robotsParser('http://www.example.com/robots.txt', robotsTxt);

  expect(parser.getPreferredHost()).to.equal(null);
});
|
||||
|
||||
it('should fallback to * when a UA has no rules of its own', function () {
  // Unknown agents inherit the wildcard group; agents with their own
  // group do not.
  var robotsTxt = [
    'user-agent: *',
    'crawl-delay: 1',

    'user-agent: b',
    'crawl-delay: 12',

    'user-agent: c',
    'user-agent: d',
    'crawl-delay: 10'
  ].join('\n');

  var parser = robotsParser('http://www.example.com/robots.txt', robotsTxt);

  expect(parser.getCrawlDelay('should-fall-back')).to.equal(1);
  expect(parser.getCrawlDelay('d')).to.equal(10);
  expect(parser.getCrawlDelay('dd')).to.equal(1);
});
|
||||
|
||||
it('should not fallback to * when a UA has rules', function () {
  // Any group of its own — even one containing only an empty disallow —
  // stops an agent from inheriting the wildcard group's crawl-delay.
  var robotsTxt = [
    'user-agent: *',
    'crawl-delay: 1',

    'user-agent: b',
    'disallow:'
  ].join('\n');

  var parser = robotsParser('http://www.example.com/robots.txt', robotsTxt);

  expect(parser.getCrawlDelay('b')).to.equal(undefined);
});
|
||||
|
||||
it('should handle UAs with object property names', function () {
  // Agent names like "constructor" / "__proto__" must not collide with
  // Object.prototype properties in the parser's internal lookup table.
  var robotsTxt = 'User-agent: *\nDisallow: /fish';

  var parser = robotsParser('http://www.example.com/robots.txt', robotsTxt);

  expect(parser.isAllowed('http://www.example.com/fish', 'constructor')).to.equal(false);
  expect(parser.isAllowed('http://www.example.com/fish', '__proto__')).to.equal(false);
});
|
||||
|
||||
it('should ignore version numbers in the UA string', function () {
  // "name/1.0"-style suffixes are stripped before matching group names.
  var robotsTxt = [
    'user-agent: *',
    'crawl-delay: 1',

    'user-agent: b',
    'crawl-delay: 12',

    'user-agent: c',
    'user-agent: d',
    'crawl-delay: 10'
  ].join('\n');

  var parser = robotsParser('http://www.example.com/robots.txt', robotsTxt);

  expect(parser.getCrawlDelay('should-fall-back/1.0.0')).to.equal(1);
  expect(parser.getCrawlDelay('d/12')).to.equal(10);
  expect(parser.getCrawlDelay('dd / 0-32-3')).to.equal(1);
  expect(parser.getCrawlDelay('b / 1.0')).to.equal(12);
});
|
||||
|
||||
|
||||
it('should return the line number of the matching directive', function () {
  // getMatchingLineNumber reports the 1-based line of the rule that
  // decided the verdict, or -1 when no rule matched.
  var robotsTxt = [
    '',
    'User-agent: *',
    '',
    'Disallow: /fish/',
    'Disallow: /test.html',
    'Allow: /fish/test.html',
    'Allow: /test.html',
    '',
    'User-agent: a',
    'allow: /',
    '',
    'User-agent: b',
    'disallow: /test',
    'disallow: /t*t',
    '',
    'User-agent: c',
    'Disallow: /fish*.php',
    'Allow: /fish/index.php'
  ].join('\n');

  var parser = robotsParser('http://www.example.com/robots.txt', robotsTxt);

  // No rule matches these two; the third is decided by the Allow on line 6.
  expect(parser.getMatchingLineNumber('http://www.example.com/fish')).to.equal(-1);
  expect(parser.getMatchingLineNumber('http://www.example.com/fish/test.html')).to.equal(6);
  expect(parser.getMatchingLineNumber('http://www.example.com/Test.html')).to.equal(-1);

  expect(parser.getMatchingLineNumber('http://www.example.com/fish/index.php')).to.equal(4);
  expect(parser.getMatchingLineNumber('http://www.example.com/fish/')).to.equal(4);
  expect(parser.getMatchingLineNumber('http://www.example.com/test.html')).to.equal(7);

  expect(parser.getMatchingLineNumber('http://www.example.com/test.html', 'a')).to.equal(10);

  expect(parser.getMatchingLineNumber('http://www.example.com/fish.php', 'c')).to.equal(17);
  expect(parser.getMatchingLineNumber('http://www.example.com/fish/index.php', 'c')).to.equal(18);
});
|
||||
|
||||
it('should handle large wildcards efficiently', function () {
  // Guards against catastrophic backtracking on patterns made of
  // thousands of consecutive wildcards.
  var robotsTxt = [
    'User-agent: *',
    'Disallow: /' + '*'.repeat(2048) + '.html'
  ].join('\n');

  var reachable = [
    'http://www.example.com/' + 'sub'.repeat(2048) + 'folder/index.php'
  ];

  var blocked = [
    'http://www.example.com/secret.html'
  ];

  const begin = Date.now();
  testRobots('http://www.eXample.com/robots.txt', robotsTxt, reachable, blocked);
  const finish = Date.now();

  // Should take less than 500 ms (high to allow for variableness of
  // machines running the test, should normally be much less)
  expect(finish - begin).to.be.lessThan(500);
});
|
||||
|
||||
it('should honor given port number', function () {
  // A robots.txt fetched from :8080 only answers for URLs on :8080;
  // the default port (80 / implicit) is a different origin.
  var robotsTxt = [
    'User-agent: *',
    'Disallow: /fish/',
    'Disallow: /test.html'
  ].join('\n');

  var reachable = [
    'http://www.example.com:8080/fish',
    'http://www.example.com:8080/Test.html'
  ];

  var blocked = [
    'http://www.example.com/fish',
    'http://www.example.com/Test.html',
    'http://www.example.com:80/fish',
    'http://www.example.com:80/Test.html'
  ];

  testRobots('http://www.example.com:8080/robots.txt', robotsTxt, reachable, blocked);
});
|
||||
|
||||
it('should default to port 80 for http: if no port given', function () {
  // http: with no explicit port is equivalent to :80, but not to :443.
  var robotsTxt = [
    'User-agent: *',
    'Disallow: /fish/',
    'Disallow: /test.html'
  ].join('\n');

  var reachable = [
    'http://www.example.com:80/fish',
    'http://www.example.com:80/Test.html'
  ];

  var blocked = [
    'http://www.example.com:443/fish',
    'http://www.example.com:443/Test.html',
    'http://www.example.com:80/fish/index.php',
    'http://www.example.com:80/fish/',
    'http://www.example.com:80/test.html'
  ];

  testRobots('http://www.example.com/robots.txt', robotsTxt, reachable, blocked);
});
|
||||
|
||||
it('should default to port 443 for https: if no port given', function () {
  // https: with no explicit port is equivalent to :443; plain http URLs
  // are a different origin regardless of port.
  var robotsTxt = [
    'User-agent: *',
    'Disallow: /fish/',
    'Disallow: /test.html'
  ].join('\n');

  var reachable = [
    'https://www.example.com:443/fish',
    'https://www.example.com:443/Test.html',
    'https://www.example.com/fish',
    'https://www.example.com/Test.html'
  ];

  var blocked = [
    'http://www.example.com:80/fish',
    'http://www.example.com:80/Test.html',
    'http://www.example.com:443/fish/index.php',
    'http://www.example.com:443/fish/',
    'http://www.example.com:443/test.html'
  ];

  testRobots('https://www.example.com/robots.txt', robotsTxt, reachable, blocked);
});
|
||||
|
||||
it('should not be disallowed when wildcard is used in explicit mode', function () {
  // Fix: added the statement-terminating semicolons this block was
  // missing (the rest of the file does not rely on ASI).
  // Explicit mode only honours groups naming the agent itself, never "*".
  var contents = [
    'User-agent: *',
    'Disallow: /',
  ].join('\n');

  var url = 'https://www.example.com/hello';
  var userAgent = 'SomeBot';
  var robots = robotsParser(url, contents);

  expect(robots.isExplicitlyDisallowed(url, userAgent)).to.equal(false);
});
|
||||
|
||||
it('should be disallowed when user agent equal robots rule in explicit mode', function () {
  // Fix: added the statement-terminating semicolons this block was
  // missing (the rest of the file does not rely on ASI).
  // A group that names the agent explicitly does disallow it in explicit mode.
  var contents = [
    'User-agent: SomeBot',
    'Disallow: /',
  ].join('\n');

  var url = 'https://www.example.com/hello';
  var userAgent = 'SomeBot';
  var robots = robotsParser(url, contents);

  expect(robots.isExplicitlyDisallowed(url, userAgent)).to.equal(true);
});
|
||||
|
||||
it('should return undefined when given an invalid URL in explicit mode', function () {
  // Fix: added the statement-terminating semicolons this block was
  // missing (the rest of the file does not rely on ASI).
  // The robots.txt belongs to http://example.com, so a URL on a different
  // origin cannot be answered and yields undefined.
  var contents = [
    'User-agent: SomeBot',
    'Disallow: /',
  ].join('\n');

  var url = 'https://www.example.com/hello';
  var userAgent = 'SomeBot';
  var robots = robotsParser('http://example.com', contents);

  expect(robots.isExplicitlyDisallowed(url, userAgent)).to.equal(undefined);
});
|
||||
});
|
||||
|
|
@ -7,3 +7,8 @@
|
|||
summary {
|
||||
display: revert;
|
||||
}
|
||||
|
||||
// TODO: Actually a compatibility fix. See https://tobi.antville.org/stories/2317931/
|
||||
td[nowrap][width]:not([width$="%"]) {
|
||||
white-space: normal
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue