diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml index 98ca7656..0bc7cbb7 100644 --- a/.github/workflows/deploy.yml +++ b/.github/workflows/deploy.yml @@ -1,15 +1,48 @@ -name: Deploy (Production) +name: Deploy -on: workflow_dispatch +on: + workflow_dispatch: + inputs: + hostname: + description: Hostname + type: string + required: true + default: antville.org jobs: - deploy: + stage: runs-on: antville environment: - name: antville.org - url: https://antville.org + name: production + url: ${{ inputs.hostname }} steps: - - name: Copy files to production server - run: ssh staging-server deploy-antville + - uses: actions/checkout@v4 + + - name: Build with Gradle + run: ./gradlew :build + + - name: Copy files to server + run: | + rsync ./build/install/antville/ ${{ inputs.hostname }}:./apps/antville/ \ + --archive --compress --delete --verbose \ + --filter '+ /claustra' \ + --filter '+ /code' \ + --filter '+ /compat' \ + --filter '+ /db' \ + --filter '+ /i18n' \ + --filter '+ /lib' \ + --filter '- /*' + rsync ./build/install/antville/static/ ${{ inputs.hostname }}:./apps/antville/static/ \ + --archive --compress --verbose \ + --filter '+ /fonts' \ + --filter '+ /formica.html' \ + --filter '+ /img' \ + --filter '+ /scripts' \ + --filter '+ /styles' \ + --filter '- /*' + + - name: Restart Helma + run: ssh ${{ inputs.hostname }} restart + diff --git a/.github/workflows/stage.yml b/.github/workflows/stage.yml deleted file mode 100644 index c53cc5e1..00000000 --- a/.github/workflows/stage.yml +++ /dev/null @@ -1,42 +0,0 @@ -name: Deploy (Staging) - -on: workflow_dispatch - -jobs: - stage: - runs-on: antville - - environment: - name: stage - url: ${{ vars.stage_url }} - - steps: - - uses: actions/checkout@v4 - - - name: Build with Gradle - run: ./gradlew :build - - - name: Publish to staging server - # The rsync command applies the same filters as the one in tools/extras/deploy.sh - run: | - rsync ./build/install/antville/ staging-server:./apps/antville/ \ - --archive --compress --delete --verbose \ - --filter '+ /claustra' \ - --filter '+ /code' \ - --filter '+ /compat' \ - --filter '+ /db' \ - --filter '+ /i18n' \ - --filter '+ /lib' \ - --filter '- /*' - rsync ./build/install/antville/static/ staging-server:./apps/antville/static/ \ - --archive --compress --verbose \ - --filter '+ /fonts' \ - --filter '+ /formica.html' \ - --filter '+ /img' \ - --filter '+ /scripts' \ - --filter '+ /styles' \ - --filter '- /*' - - - name: Restart Helma - run: ssh staging-server restart - diff --git a/.nvmrc b/.nvmrc index a24992d1..b009dfb9 100644 --- a/.nvmrc +++ b/.nvmrc @@ -1 +1 @@ -lts +lts/* diff --git a/build.gradle b/build.gradle index ad84c217..6c520f17 100644 --- a/build.gradle +++ b/build.gradle @@ -51,7 +51,7 @@ dependencies { implementation 'org.commonmark:commonmark-ext-autolink:0.24.0' implementation 'org.commonmark:commonmark-ext-gfm-strikethrough:0.24.0' implementation 'org.commonmark:commonmark-ext-gfm-tables:0.24.0' - implementation 'org.jsoup:jsoup:1.20.1' + implementation 'org.jsoup:jsoup:1.21.1' implementation 'rome:rome:1.0' lessCss('org.lesscss:lesscss:1.7.0.1.1') { diff --git a/claustra/proxy/Proxy/Proxy.js b/claustra/proxy/Proxy/Proxy.js index effe0a80..e46caabf 100644 --- a/claustra/proxy/Proxy/Proxy.js +++ b/claustra/proxy/Proxy/Proxy.js @@ -46,7 +46,7 @@ Proxy.prototype.main_action = function () { let content = new java.lang.String(data.content, 'utf-8'); - if (!data.type.startsWith('text/')) { + if (data.type && !data.type.startsWith('text/')) { content = new 
java.lang.String(content.enbase64()); } @@ -55,10 +55,10 @@ Proxy.prototype.main_action = function () { } else { res.contentType = data.type; - if (data.type.startsWith('text/')) { - res.write(java.lang.String(data.content, 'utf-8')); - } else { + if (data.type && !data.type.startsWith('text/')) { res.writeBinary(data.content); + } else { + res.write(java.lang.String(data.content, 'utf-8')); } } }; diff --git a/code/Global/Robots.js b/code/Global/Robots.js new file mode 100644 index 00000000..04c69d67 --- /dev/null +++ b/code/Global/Robots.js @@ -0,0 +1,543 @@ +// Robots parser adapted for Rhino-compatible JavaScript +// Source: +// Copyright (c) 2014 Sam Clarke +// Copyright (c) 2025 Antville.org +// MIT License (MIT) + +// Transformation steps: +// 1. Add IIFE around the code +// 2. Replace module.exports with return statement +// 3. Add conditional module.exports for CommonJS support +// 4. Add URL class imitation + +var Robots = (() => { + /** + * Half-baked (read-only) imitation of the URL class of Node.js + */ + function nodeJsUrl(str, base) { + if (!str.includes('://')) { + str = (base || 'http://localhost') + str; + } + + const url = new java.net.URL(str); + const port = url.port < 0 ? '' : url.port; + const userInfo = (url.getUserInfo() || "").split(':'); + + return { + hash: url.ref ? '#' + url.ref : '', + href: url.toString(), + host: url.host + (port ? ':' + port : port), + hostname: url.host, + password: userInfo[1] || "", + pathname: url.path, + origin: url.protocol + '://' + url.host + (port ? ':' + port : port), + port, + protocol: url.protocol, + search: url.queryy ? '?' + url.query : '', + searchParams: { + get: () => null, + set: () => null + }, + username: userInfo[0] || "", + }; + } + + if (typeof URL === 'undefined') { + globalThis.URL = nodeJsUrl; + } + + /** + * Trims the white space from the start and end of the line. + * + * If the line is an array it will strip the white space from + * the start and end of each element of the array. + * + * @param {string|Array} line + * @return {string|Array} + * @private + */ + function trimLine(line) { + if (!line) { + return null; + } + + if (Array.isArray(line)) { + return line.map(trimLine); + } + + return String(line).trim(); + } + + /** + * Remove comments from lines + * + * @param {string} line + * @return {string} + * @private + */ + function removeComments(line) { + var commentStartIndex = line.indexOf('#'); + if (commentStartIndex > -1) { + return line.substr(0, commentStartIndex); + } + + return line; + } + + /** + * Splits a line at the first occurrence of : + * + * @param {string} line + * @return {Array.} + * @private + */ + function splitLine(line) { + var idx = String(line).indexOf(':'); + + if (!line || idx < 0) { + return null; + } + + return [line.slice(0, idx), line.slice(idx + 1)]; + } + + /** + * Normalises the user-agent string by converting it to + * lower case and removing any version numbers. + * + * @param {string} userAgent + * @return {string} + * @private + */ + function formatUserAgent(userAgent) { + var formattedUserAgent = userAgent.toLowerCase(); + + // Strip the version number from robot/1.0 user agents + var idx = formattedUserAgent.indexOf('/'); + if (idx > -1) { + formattedUserAgent = formattedUserAgent.substr(0, idx); + } + + return formattedUserAgent.trim(); + } + + /** + * Normalises the URL encoding of a path by encoding + * unicode characters. 
+ * + * @param {string} path + * @return {string} + * @private + */ + function normaliseEncoding(path) { + try { + return urlEncodeToUpper(encodeURI(path).replace(/%25/g, '%')); + } catch (e) { + return path; + } + } + + /** + * Convert URL encodings to support case. + * + * e.g.: %2a%ef becomes %2A%EF + * + * @param {string} path + * @return {string} + * @private + */ + function urlEncodeToUpper(path) { + return path.replace(/%[0-9a-fA-F]{2}/g, function (match) { + return match.toUpperCase(); + }); + } + + /** + * Matches a pattern with the specified path + * + * Uses same algorithm to match patterns as the Google implementation in + * google/robotstxt so it should be consistent with the spec. + * + * @see https://github.com/google/robotstxt/blob/f465f0ede81099dd8bc4aeb2966b3a892bd488b3/robots.cc#L74 + * @param {string} pattern + * @param {string} path + * @return {boolean} + * @private + */ + function matches(pattern, path) { + // I've added extra comments to try make this easier to understand + + // Stores the lengths of all the current matching substrings. + // Maximum number of possible matching lengths is every length in path plus + // 1 to handle 0 length too (if pattern starts with * which is zero or more) + var matchingLengths = new Array(path.length + 1); + var numMatchingLengths = 1; + + // Initially longest match is 0 + matchingLengths[0] = 0; + + for (var p = 0; p < pattern.length; p++) { + // If $ is at the end of pattern then we must match the whole path. + // Which is true if the longest matching length matches path length + if (pattern[p] === '$' && p + 1 === pattern.length) { + return matchingLengths[numMatchingLengths - 1] === path.length; + } + + // Handle wildcards + if (pattern[p] == '*') { + // Wildcard so all substrings minus the current smallest matching + // length are matches + numMatchingLengths = path.length - matchingLengths[0] + 1; + + // Update matching lengths to include the smallest all the way up + // to numMatchingLengths + // Don't update smallest possible match as * matches zero or more + // so the smallest current match is also valid + for (var i = 1; i < numMatchingLengths; i++) { + matchingLengths[i] = matchingLengths[i - 1] + 1; + } + } else { + // Check the char at the matching length matches the pattern, if it + // does increment it and add it as a valid length, ignore if not. 
+ var numMatches = 0; + for (var i = 0; i < numMatchingLengths; i++) { + if ( + matchingLengths[i] < path.length && + path[matchingLengths[i]] === pattern[p] + ) { + matchingLengths[numMatches++] = matchingLengths[i] + 1; + } + } + + // No paths matched the current pattern char so not a match + if (numMatches == 0) { + return false; + } + + numMatchingLengths = numMatches; + } + } + + return true; + } + + function parseRobots(contents, robots) { + var newlineRegex = /\r\n|\r|\n/; + var lines = contents + .split(newlineRegex) + .map(removeComments) + .map(splitLine) + .map(trimLine); + + var currentUserAgents = []; + var isNoneUserAgentState = true; + for (var i = 0; i < lines.length; i++) { + var line = lines[i]; + + if (!line || !line[0]) { + continue; + } + + switch (line[0].toLowerCase()) { + case 'user-agent': + if (isNoneUserAgentState) { + currentUserAgents.length = 0; + } + + if (line[1]) { + currentUserAgents.push(formatUserAgent(line[1])); + } + break; + case 'disallow': + robots.addRule(currentUserAgents, line[1], false, i + 1); + break; + case 'allow': + robots.addRule(currentUserAgents, line[1], true, i + 1); + break; + case 'crawl-delay': + robots.setCrawlDelay(currentUserAgents, line[1]); + break; + case 'sitemap': + if (line[1]) { + robots.addSitemap(line[1]); + } + break; + case 'host': + if (line[1]) { + robots.setPreferredHost(line[1].toLowerCase()); + } + break; + } + + isNoneUserAgentState = line[0].toLowerCase() !== 'user-agent'; + } + } + + /** + * Returns if a pattern is allowed by the specified rules. + * + * @param {string} path + * @param {Array.} rules + * @return {Object?} + * @private + */ + function findRule(path, rules) { + var matchedRule = null; + + for (var i = 0; i < rules.length; i++) { + var rule = rules[i]; + + if (!matches(rule.pattern, path)) { + continue; + } + + // The longest matching rule takes precedence + // If rules are the same length then allow takes precedence + if (!matchedRule || rule.pattern.length > matchedRule.pattern.length) { + matchedRule = rule; + } else if ( + rule.pattern.length == matchedRule.pattern.length && + rule.allow && + !matchedRule.allow + ) { + matchedRule = rule; + } + } + + return matchedRule; + } + + /** + * Converts provided string into an URL object. + * + * Will return null if provided string is not a valid URL. + * + * @param {string} url + * @return {?URL} + * @private + */ + function parseUrl(url) { + try { + // Specify a URL to be used with relative paths + // Using non-existent subdomain so can never cause conflict unless + // trying to crawl it but doesn't exist and even if tried worst that can + // happen is it allows relative URLs on it. + var url = new URL(url, 'http://robots-relative.samclarke.com/'); + + if (!url.port) { + url.port = url.protocol === 'https:' ? 443 : 80; + } + + return url; + } catch (e) { + return null; + } + } + + function Robots(url, contents) { + this._url = parseUrl(url) || {}; + this._rules = Object.create(null); + this._sitemaps = []; + this._preferredHost = null; + + parseRobots(contents || '', this); + } + + /** + * Adds the specified allow/deny rule to the rules + * for the specified user-agents. 
+ * + * @param {Array.} userAgents + * @param {string} pattern + * @param {boolean} allow + * @param {number} [lineNumber] Should use 1-based indexing + */ + Robots.prototype.addRule = function (userAgents, pattern, allow, lineNumber) { + var rules = this._rules; + + userAgents.forEach(function (userAgent) { + rules[userAgent] = rules[userAgent] || []; + + if (!pattern) { + return; + } + + rules[userAgent].push({ + pattern: normaliseEncoding(pattern), + allow: allow, + lineNumber: lineNumber + }); + }); + }; + + /** + * Adds the specified delay to the specified user agents. + * + * @param {Array.} userAgents + * @param {string} delayStr + */ + Robots.prototype.setCrawlDelay = function (userAgents, delayStr) { + var rules = this._rules; + var delay = Number(delayStr); + + userAgents.forEach(function (userAgent) { + rules[userAgent] = rules[userAgent] || []; + + if (isNaN(delay)) { + return; + } + + rules[userAgent].crawlDelay = delay; + }); + }; + + /** + * Add a sitemap + * + * @param {string} url + */ + Robots.prototype.addSitemap = function (url) { + this._sitemaps.push(url); + }; + + /** + * Sets the preferred host name + * + * @param {string} url + */ + Robots.prototype.setPreferredHost = function (url) { + this._preferredHost = url; + }; + + Robots.prototype._getRule = function (url, ua, explicit) { + var parsedUrl = parseUrl(url) || {}; + var userAgent = formatUserAgent(ua || '*'); + + // The base URL must match otherwise this robots.txt is not valid for it. + if ( + parsedUrl.protocol !== this._url.protocol || + parsedUrl.hostname !== this._url.hostname || + parsedUrl.port !== this._url.port + ) { + return; + } + + var rules = this._rules[userAgent]; + if (!explicit) { + rules = rules || this._rules['*']; + } + rules = rules || []; + var path = urlEncodeToUpper(parsedUrl.pathname + parsedUrl.search); + var rule = findRule(path, rules); + + return rule; + }; + + /** + * Returns true if allowed, false if not allowed. + * + * Will return undefined if the URL is not valid for + * this robots.txt file. + * + * @param {string} url + * @param {string?} ua + * @return {boolean?} + */ + Robots.prototype.isAllowed = function (url, ua) { + var rule = this._getRule(url, ua, false); + + if (typeof rule === 'undefined') { + return; + } + + return !rule || rule.allow; + }; + + /** + * Returns the line number of the matching directive for the specified + * URL and user-agent if any. + * + * The line numbers start at 1 and go up (1-based indexing). + * + * Return -1 if there is no matching directive. If a rule is manually + * added without a lineNumber then this will return undefined for that + * rule. + * + * @param {string} url + * @param {string?} ua + * @return {number?} + */ + Robots.prototype.getMatchingLineNumber = function (url, ua) { + var rule = this._getRule(url, ua, false); + + return rule ? rule.lineNumber : -1; + }; + + /** + * Returns the opposite of isAllowed() + * + * @param {string} url + * @param {string?} ua + * @return {boolean} + */ + Robots.prototype.isDisallowed = function (url, ua) { + return !this.isAllowed(url, ua); + }; + + /** + * Returns trues if explicitly disallowed + * for the specified user agent (User Agent wildcards are discarded). + * + * This will return undefined if the URL is not valid for this robots.txt file. 
+ * + * @param {string} url + * @param {string} ua + * @return {boolean?} + */ + Robots.prototype.isExplicitlyDisallowed = function (url, ua) { + var rule = this._getRule(url, ua, true); + if (typeof rule === 'undefined') { + return; + } + + return !(!rule || rule.allow); + }; + + /** + * Gets the crawl delay if there is one. + * + * Will return undefined if there is no crawl delay set. + * + * @param {string} ua + * @return {number?} + */ + Robots.prototype.getCrawlDelay = function (ua) { + var userAgent = formatUserAgent(ua || '*'); + + return (this._rules[userAgent] || this._rules['*'] || {}).crawlDelay; + }; + + /** + * Returns the preferred host if there is one. + * + * @return {string?} + */ + Robots.prototype.getPreferredHost = function () { + return this._preferredHost; + }; + + /** + * Returns an array of sitemap URLs if there are any. + * + * @return {Array.} + */ + Robots.prototype.getSitemaps = function () { + return this._sitemaps.slice(0); + }; + + return Robots; +})(); + +if (typeof module !== 'undefined' && module.exports) { + module.exports = Robots; +} diff --git a/code/HopObject/HopObject.js b/code/HopObject/HopObject.js index 9e35a00d..b9983c5d 100644 --- a/code/HopObject/HopObject.js +++ b/code/HopObject/HopObject.js @@ -140,10 +140,18 @@ HopObject.prototype.onRequest = function() { } } + // Set up layout handler and skin path HopObject.confirmConstructor(Layout); res.handlers.layout = res.handlers.site.layout || new Layout; res.skinpath = res.handlers.layout.getSkinPath(); + if (res.handlers.site.enforceRobotsTxt()) { + res.status = 403 + res.data.error = gettext('The robots.txt file disallows access to this page.', res.handlers.site.href('robots.txt')); + root.error_action(); + res.stop(); + } + if (!this.getPermission(req.action)) { if (!session.user) { User.setLocation(root.href() + req.path); diff --git a/code/Members/Members.js b/code/Members/Members.js index 112b6f84..e3670076 100644 --- a/code/Members/Members.js +++ b/code/Members/Members.js @@ -202,7 +202,23 @@ Members.prototype.login_action = function() { } res.message = gettext('Welcome to {0}, {1}. Have fun!', res.handlers.site.getTitle(), user.name); - res.redirect(User.getLocation() || this._parent.href()); + + const location = User.getLocation() || this._parent.href(); + + // If the requested host is outside of the cookie domain, redirect and login to the root site, too + if (this._parent !== root && !req.getHeader("Host").includes(app.appsProperties.cookieDomain)) { + const token = java.util.UUID.randomUUID(); + const digest = session.user.getDigest(token); + session.user.setMetadata('rootCookieToken', token); + res.redirect( + root.href('cookie') + + '?digest=' + encodeURIComponent(digest) + + '&name=' + encodeURIComponent(req.postParams.name) + + '&location=' + encodeURIComponent(location) + ); + } + + res.redirect(location); } catch (ex) { res.message = ex; } diff --git a/code/Root/$Root.skin b/code/Root/$Root.skin index a99f16ba..2cc4a105 100644 --- a/code/Root/$Root.skin +++ b/code/Root/$Root.skin @@ -136,16 +136,16 @@ (<% param.helmaBuildDate %>)
 <% gettext "Scripting Engine" %>
-<% param.rhino %>
+<% param.rhino %>
 <% gettext "Webserver" %>
-Jetty <% param.jetty %>
+Jetty <% param.jetty %>
 <% gettext "Servlet Interface" %>
-Javax <% param.servlet %>
+Jakarta <% param.servlet %>
 <% gettext "Virtual Machine" %>
-Java <% param.java %>
+Java <% param.java %>
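For orientation, a minimal usage sketch of the Robots parser added above in code/Global/Robots.js. The robots.txt rules, host and bot names below are invented for illustration and appear nowhere in the codebase; the commented results follow from the parser's longest-match semantics. The enforceRobotsTxt() check wired into HopObject.onRequest() presumably builds on this API, but its implementation is not part of this diff.

// Illustrative only: sample rules and user agents are made up.
var contents = [
  'User-agent: *',
  'Disallow: /members/',
  'Crawl-delay: 10',
  'Sitemap: https://example.antville.org/sitemap.xml'
].join('\n');

var robots = new Robots('https://example.antville.org/robots.txt', contents);

robots.isAllowed('https://example.antville.org/', 'ExampleBot/1.0');             // true
robots.isDisallowed('https://example.antville.org/members/login', 'ExampleBot'); // true
robots.getCrawlDelay('ExampleBot');                                              // 10
robots.getSitemaps();                       // ['https://example.antville.org/sitemap.xml']
robots.isAllowed('https://elsewhere.example/', 'ExampleBot');                    // undefined (host does not match)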
diff --git a/code/Root/Root.js b/code/Root/Root.js index 0207fea6..afe4efa3 100644 --- a/code/Root/Root.js +++ b/code/Root/Root.js @@ -94,6 +94,7 @@ Root.prototype.getPermission = function(action) { switch (action) { case '.': case 'main': + case 'cookie': case 'debug': case 'default.hook': case 'favicon.ico': @@ -367,6 +368,23 @@ Root.prototype.mrtg_action = function() { return; } +// Login to the root site if Members#login_action() redirects here +// This way custom domains are getting the default domain cookie, too +Root.prototype.cookie_action = function() { + if (req.data.digest && req.data.name) { + const user = User.getByName(req.data.name); + if (user) { + const token = user.getMetadata("rootCookieToken"); + const digest = user.getDigest(token); + if (digest === req.data.digest) { + session.login(user); + user.deleteMetadata("rootCookieToken"); + } + } + } + res.redirect(req.data.location || req.data.http_referer || root.href()); +}; + /** * Catch some undefined macro handlers, then delegate to the super prototype. * @param {String} name diff --git a/code/Site/$Site.skin b/code/Site/$Site.skin index a5098f36..42d4fc1d 100644 --- a/code/Site/$Site.skin +++ b/code/Site/$Site.skin @@ -143,6 +143,22 @@ +
+<% gettext 'Edit the rules in the robots.txt skin.' <% site.layout.skins.href %> %>
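Finally, a condensed recap of the cross-domain login handshake split between Members.login_action() and the new Root.cookie_action(). Every call below appears in the diff; only the linear arrangement is illustrative, as the second step actually runs in a separate request after the redirect. The apparent intent: only a digest of a random, single-use token travels in the query string, and deleting the token after verification keeps the redirect from being replayable.

// Step 1: Members.login_action() on a custom domain, after a successful login.
const token = java.util.UUID.randomUUID();           // one-time token
const digest = session.user.getDigest(token);        // safe to expose in a URL
session.user.setMetadata('rootCookieToken', token);  // remembered server-side
// ... then redirect to root.href('cookie') with digest, name and location parameters

// Step 2: Root.cookie_action() on the default (cookie) domain verifies the digest.
const user = User.getByName(req.data.name);
if (user && user.getDigest(user.getMetadata('rootCookieToken')) === req.data.digest) {
  session.login(user);                     // now the default domain has the cookie, too
  user.deleteMetadata('rootCookieToken');  // the token cannot be reused
}
res.redirect(req.data.location || req.data.http_referer || root.href());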