diff options
Diffstat (limited to 'src/replacer.js')
-rw-r--r-- | src/replacer.js | 1035 |
1 files changed, 1035 insertions, 0 deletions
// Extracted from the diff which adds src/replacer.js
// (new file mode 100644, index 00000000..779ee78d).

// Regex-based forward parser for wiki content, breaking up text input into
// text and (possibly nested) tag nodes.
//
// The behavior here is quite tied into the `transformContent` content
// function, which converts nodes parsed here into actual HTML, links, etc
// for embedding in a wiki webpage.

import * as marked from 'marked';

import * as html from '#html';
import {empty, escapeRegex, typeAppearance} from '#sugar';
import {matchMarkdownLinks} from '#wiki-data';

// Table of replacer keys - the part before the colon in `[[key:value]]`.
//
// Each entry is a plain object made up of some of these properties:
//
//   find:          string key or null. Presumably names the lookup used to
//                  resolve the tag's value into a wiki thing, with null
//                  meaning "no lookup" - confirm against `transformContent`.
//   link:          string key or null. Presumably names the content function
//                  used to generate the link - confirm against the consumer.
//   value:         function mapping the raw reference string to the value
//                  which gets passed along to `html`.
//   html:          function turning that value into output, given an object
//                  of utilities (`html`, `language`, `args` are the ones
//                  destructured below).
//   transformName: function (name, node, input) => name, adjusting the
//                  display name based on where the tag sits in the input.
//
// How these properties are combined is decided by the code consuming this
// table, which isn't part of this file.
export const replacerSpec = {
  'album': {
    find: 'album',
    link: 'linkAlbumDynamically',
  },

  'album-commentary': {
    find: 'album',
    link: 'linkAlbumCommentary',
  },

  'album-gallery': {
    find: 'album',
    link: 'linkAlbumGallery',
  },

  'album-referenced-artworks': {
    find: 'albumWithArtwork',
    link: 'linkAlbumReferencedArtworks',
  },

  'album-referencing-artworks': {
    find: 'albumWithArtwork',
    link: 'linkAlbumReferencingArtworks',
  },

  'artist': {
    find: 'artist',
    link: 'linkArtist',
  },

  'artist-gallery': {
    find: 'artist',
    link: 'linkArtistGallery',
  },

  'commentary-index': {
    find: null,
    link: 'linkCommentaryIndex',
  },

  'date': {
    find: null,

    // Note that an unparseable reference yields an Invalid Date object
    // rather than throwing.
    value: (ref) => new Date(ref),

    // NOTE(review): `toUTCString()` produces an RFC-7231-style string
    // ("Tue, 01 Jan 2019 00:00:00 GMT"), which doesn't look like one of
    // the formats HTML accepts for <time datetime>. Left as-is because
    // changing it alters generated pages - confirm whether an ISO 8601
    // string is wanted here.
    html: (date, {html, language}) =>
      html.tag('time',
        {datetime: date.toUTCString()},
        language.formatDate(date)),
  },

  'flash-index': {
    find: null,
    link: 'linkFlashIndex',
  },

  'flash': {
    find: 'flash',
    link: 'linkFlash',

    // Drops a trailing period from the name when the tag is immediately
    // followed by anything other than a space, a line break, or the start
    // of an HTML tag (this includes the tag sitting at the very end of
    // the input, where there is no following character at all).
    transformName(name, node, input) {
      const nextCharacter = input[node.iEnd];
      const lastCharacter = name[name.length - 1];

      const followedByBoundary =
        [' ', '\n', '<'].includes(nextCharacter);

      if (!followedByBoundary && lastCharacter === '.') {
        return name.slice(0, -1);
      }

      return name;
    },
  },

  'flash-act': {
    find: 'flashAct',
    link: 'linkFlashAct',
  },

  'flash-side': {
    find: 'flashSide',
    link: 'linkFlashSide',
  },

  'group': {
    find: 'group',
    link: 'linkGroup',
  },

  'group-gallery': {
    find: 'group',
    link: 'linkGroupGallery',
  },

  'home': {
    find: null,
    link: 'linkWikiHomepage',
  },

  'listing-index': {
    find: null,
    link: 'linkListingIndex',
  },

  'listing': {
    find: 'listing',
    link: 'linkListing',
  },

  'media': {
    find: null,
    link: 'linkPathFromMedia',
  },

  'news-index': {
    find: null,
    link: 'linkNewsIndex',
  },

  'news-entry': {
    find: 'newsEntry',
    link: 'linkNewsEntry',
  },

  'root': {
    find: null,
    link: 'linkPathFromRoot',
  },

  'site': {
    find: null,
    link: 'linkPathFromSite',
  },

  'static': {
    find: 'staticPage',
    link: 'linkStaticPage',
  },

  'string': {
    find: null,
    value: (ref) => ref,
    html: (ref, {language, args}) => language.$(ref, args),
  },

  'tag': {
    find: 'artTag',
    link: 'linkArtTagDynamically',
  },

  'tag-info': {
    find: 'artTag',
    link: 'linkArtTagInfo',
  },

  'track': {
    find: 'track',
    link: 'linkTrackDynamically',
  },

  'track-referenced-artworks': {
    find: 'trackWithArtwork',
    link: 'linkTrackReferencedArtworks',
  },

  'track-referencing-artworks': {
    find: 'trackWithArtwork',
    link: 'linkTrackReferencingArtworks',
  },

  // Unlike every other entry, this one has no `find` property at all
  // (rather than `find: null`), and an explicitly null `link`.
  'tooltip': {
    value: (ref) => ref,
    link: null,
  },
};

// Syntax literals.
const tagBeginning = '[[';
const tagEnding = ']]';
const tagReplacerValue = ':';
const tagHash = '#';
const tagArgument = '*';
const tagArgumentValue = '=';
const tagLabel = '|';

// Regex fragment: the literal which follows only counts when it is not
// directly preceded by whitespace.
const noPrecedingWhitespace = '(?<!\\s)';

// Regex sources (strings, not RegExp objects) for each syntax literal.
// These get joined into one alternation per set of stop literals.
const R_tagBeginning = escapeRegex(tagBeginning);
const R_tagEnding = escapeRegex(tagEnding);
const R_tagReplacerValue = noPrecedingWhitespace + escapeRegex(tagReplacerValue);
const R_tagHash = noPrecedingWhitespace + escapeRegex(tagHash);
const R_tagArgument = escapeRegex(tagArgument);
const R_tagArgumentValue = escapeRegex(tagArgumentValue);
const R_tagLabel = escapeRegex(tagLabel);

// Compiled regular expressions, keyed by their source string.
const regexpCache = new Map();

// Parse errors are thrown as plain objects of this shape (not Error
// instances); parseContentNodes recognizes them by `type: 'error'` and
// converts them into a SyntaxError with a pointer into the input.
const makeError = (i, message) => ({i, type: 'error', data: {message}});
const endOfInput = (i, comment) =>
  makeError(i, `Unexpected end of input (${comment}).`);

// Out-of-band results of the most recent parseNodes call:
//
//   stopped:      whether that call ended by hitting one of its stopAt
//                 literals (as opposed to running out of input)
//   stop_iParse:  the position just past that literal
//   stop_literal: the literal itself
//
// These live at module scope, which might seem odd for a recursive function,
// but the values are only ever read immediately after the call that set them.
let stopped, stop_iParse, stop_literal;

// Parses exactly one text node starting at `i`, stopping at any of the
// `stopAt` literals. Returns undefined if there's no text before the stop,
// and throws a parse error if a [[tag]] shows up.
function parseOneTextNode(input, i, stopAt) {
  return parseNodes(input, i, stopAt, true)[0];
}

// Parses `input` starting at position `i` into an array of nodes:
//
//   {i, iEnd, type: 'text', data: string}
//   {i, iEnd, type: 'tag',
//    data: {replacerKey, replacerValue, hash, args, label}}
//
// stopAt:   optional array of regex sources; parsing ends at the first
//           unescaped match of any of them (see stopped / stop_iParse /
//           stop_literal above). Without it, parsing runs to end of input.
// textOnly: if true, encountering a [[tag]] is a parse error.
//
// Throws makeError-shaped objects on malformed syntax.
function parseNodes(input, i, stopAt, textOnly) {
  const nodes = [];

  stopped = false;

  // Pushes the text between iString and the current position as a text
  // node, provided anything is left of it after cleaning.
  const pushTextNode = (iString, isLast) => {
    let string = input.slice(iString, i);

    // If this is the last text node before stopping (at a stopAt match
    // or the end of the input), trim off whitespace at the end.
    if (isLast) {
      string = string.trimEnd();
    }

    string = cleanRawText(string);

    if (string.length) {
      nodes.push({i: iString, iEnd: i, type: 'text', data: string});
    }
  };

  const literalsToMatch = stopAt
    ? stopAt.concat([R_tagBeginning])
    : [R_tagBeginning];

  // The backslash stuff here is to only match an even (or zero) number
  // of sequential backslashes. Even amounts always cancel out! Odd amounts
  // don't, which would mean the following literal is being escaped and
  // should be counted only as part of the current string/text.
  //
  // Inspired by this: https://stackoverflow.com/a/41470813
  const regexpSource = `(?<!\\\\)(?:\\\\{2})*(${literalsToMatch.join('|')})`;

  // There are basically only a few regular expressions we'll ever use,
  // but it's a pain to hard-code them all, so we dynamically generate
  // and cache them for reuse instead.
  let regexp = regexpCache.get(regexpSource);
  if (!regexp) {
    regexp = new RegExp(regexpSource);
    regexpCache.set(regexpSource, regexp);
  }

  // Skip whitespace at the start of parsing. This is run every time
  // parseNodes is called (and thus parseOneTextNode too), so spaces
  // at the start of syntax elements will always be skipped. We don't
  // skip whitespace that shows up inside content (i.e. once we start
  // parsing below), though!
  const whitespaceOffset = input.slice(i).search(/[^\s]/);

  // If the string is all whitespace, that's just zero content, so
  // return the empty nodes array.
  if (whitespaceOffset === -1) {
    return nodes;
  }

  i += whitespaceOffset;

  while (i < input.length) {
    // NOTE(review): matching against a slice means the lookbehinds in the
    // regex can't see characters before `i`, so e.g. a ':' which is the
    // very first character after skipped whitespace still counts as a
    // literal. Kept as-is since changing it would alter parse results.
    const match = input.slice(i).match(regexp);

    if (!match) {
      const iString = i;
      i = input.length;
      pushTextNode(iString, true);
      break;
    }

    // The whole match (match[0]) may begin with pairs of escaped
    // backslashes; those belong to the preceding text. Only the captured
    // group is the literal itself, so compare and measure using that.
    const closestMatch = match[1];
    const closestMatchIndex =
      i + match.index + (match[0].length - closestMatch.length);

    if (textOnly && closestMatch === tagBeginning)
      throw makeError(i, `Unexpected [[tag]] - expected only text here.`);

    const stopHere = closestMatch !== tagBeginning;

    const iString = i;
    i = closestMatchIndex;
    pushTextNode(iString, stopHere);

    i += closestMatch.length;

    if (stopHere) {
      stopped = true;
      stop_iParse = i;
      stop_literal = closestMatch;
      break;
    }

    // Anything which isn't a stop literal is the beginning of a tag.

    const iTag = closestMatchIndex;

    let N;

    // Replacer key (or value)

    N = parseOneTextNode(input, i, [
      R_tagReplacerValue,
      R_tagHash,
      R_tagArgument,
      R_tagLabel,
      R_tagEnding,
    ]);

    if (!stopped) throw endOfInput(i, `reading replacer key`);

    if (!N) {
      switch (stop_literal) {
        case tagReplacerValue:
        case tagArgument:
          throw makeError(i, `Expected text (replacer key).`);
        case tagLabel:
        case tagHash:
        case tagEnding:
          throw makeError(i, `Expected text (replacer key/value).`);
      }
    }

    const replacerFirst = N;
    i = stop_iParse;

    // Replacer value (if explicit)

    let replacerSecond;

    if (stop_literal === tagReplacerValue) {
      N = parseNodes(input, i, [
        R_tagHash,
        R_tagArgument,
        R_tagLabel,
        R_tagEnding,
      ]);

      if (!stopped) throw endOfInput(i, `reading replacer value`);
      if (!N.length) throw makeError(i, `Expected content (replacer value).`);

      replacerSecond = N;
      i = stop_iParse;
    }

    // Assign first & second to replacer key/value

    let replacerKey, replacerValue;

    // Value is an array of nodes, but key is just one (or null).
    // So if we use replacerFirst as the value, we need to stick
    // it in an array (on its own).
    if (replacerSecond) {
      replacerKey = replacerFirst;
      replacerValue = replacerSecond;
    } else {
      replacerKey = null;
      replacerValue = [replacerFirst];
    }

    // Hash

    let hash;

    if (stop_literal === tagHash) {
      N = parseOneTextNode(input, i, [R_tagArgument, R_tagLabel, R_tagEnding]);

      if (!stopped) throw endOfInput(i, `reading hash`);
      if (!N) throw makeError(i, `Expected text (hash).`);

      hash = N;
      i = stop_iParse;
    }

    // Arguments

    const args = [];

    while (stop_literal === tagArgument) {
      N = parseOneTextNode(input, i, [
        R_tagArgumentValue,
        R_tagArgument,
        R_tagLabel,
        R_tagEnding,
      ]);

      if (!stopped) throw endOfInput(i, `reading argument key`);

      if (stop_literal !== tagArgumentValue)
        throw makeError(
          i,
          `Expected ${tagArgumentValue} (tag argument).`
        );

      if (!N) throw makeError(i, `Expected text (argument key).`);

      const key = N;
      i = stop_iParse;

      N = parseNodes(input, i, [R_tagArgument, R_tagLabel, R_tagEnding]);

      if (!stopped) throw endOfInput(i, `reading argument value`);
      if (!N.length) throw makeError(i, `Expected content (argument value).`);

      const value = N;
      i = stop_iParse;

      args.push({key, value});
    }

    // Label

    let label;

    if (stop_literal === tagLabel) {
      N = parseOneTextNode(input, i, [R_tagEnding]);

      if (!stopped) throw endOfInput(i, `reading label`);
      if (!N) throw makeError(i, `Expected text (label).`);

      label = N;
      i = stop_iParse;
    }

    nodes.push({
      i: iTag,
      iEnd: i,
      type: 'tag',
      data: {replacerKey, replacerValue, hash, args, label},
    });

    // The nested calls above left `stopped` set from reaching this tag's
    // own ending. That says nothing about *this* call, which hasn't hit
    // any of its stop literals yet - without resetting, running out of
    // input right after a nested tag would look like a successful stop
    // to the caller.
    stopped = false;
  }

  return nodes;
}

// Squashes backslashes which aren't themselves escaped into the following
// character, unless that character is one of a set of characters where the
// backslash carries meaning into later formatting (i.e. markdown). Note that
// we do NOT compress double backslashes into single backslashes.
export function squashBackslashes(text) {
  // The `^` alternative lets a backslash at the very start of the text be
  // squashed too; there is no preceding character to anchor on there.
  return text.replace(/((?:^|[^\\])(?:\\{2})*)\\(?![\\*_~>.-])/g, '$1');
}

// Replaces stuff like <html:a> with <a>; these signal that the tag shouldn't
// be processed by the replacer system, and should just be embedded into the
// content as raw HTML.
export function restoreRawHTMLTags(text) {
  return text.replace(/<html:(.*?)(?=[ >])/g, '<$1');
}

// Cleanup applied to the contents of every text node: backslash squashing
// first, then raw HTML tag restoration.
export function cleanRawText(text) {
  return restoreRawHTMLTags(squashBackslashes(text));
}

// Strips <!-- HTML comments --> out of text nodes. Other nodes are passed
// through untouched; text nodes are replaced with fresh objects (keeping
// their original i / iEnd).
export function postprocessComments(inputNodes) {
  const commentRegexp =
    new RegExp(
      (// Remove comments which occupy entire lines, trimming the line break
       // leading into them. These comments never include the ending of a
       // comment which does not end a line, which is a regex way of saying
       // "please fail early if we hit a --> that doesn't happen at the end
       // of the line".
       String.raw`\n<!--(?:(?!-->(?!$))[\s\S])*?-->(?=$)` +

       '|' +

       // Remove comments which appear at the start of a line, and any
       // following spaces.
       String.raw`^<!--[\s\S]*?--> *` +

       '|' +

       // Remove comments which appear anywhere else, including in the
       // middle of a line or at the end of a line, and any leading spaces.
       String.raw` *<!--[\s\S]*?-->`),

      'gm');

  const outputNodes = [];

  for (const node of inputNodes) {
    if (node.type !== 'text') {
      outputNodes.push(node);
      continue;
    }

    outputNodes.push({
      type: 'text',
      data: node.data.replace(commentRegexp, ''),
      i: node.i,
      iEnd: node.iEnd,
    });
  }

  return outputNodes;
}

// Splits text nodes around each `<tagName ...>` (one with attributes - the
// space after the tag name is required), replacing the tag with whatever
// node `callback(attributes, {inline})` returns.
//
// `inline` is true unless the tag appears to sit on a line of its own.
// Errors thrown by the callback are collected and rethrown together as
// an AggregateError once every node has been visited.
function postprocessHTMLTags(inputNodes, tagName, callback) {
  const outputNodes = [];
  const errors = [];

  const lastNode = inputNodes.at(-1);

  // Non-self-closing tags optionally swallow an immediately following
  // closing tag, too.
  const regexp =
    new RegExp(
      `<${tagName} (.*?)>` +
      (html.selfClosingTags.includes(tagName)
        ? ''
        : `(?:</${tagName}>)?`),
      'g');

  let atStartOfLine = true;

  for (const node of inputNodes) {
    if (node.type === 'tag') {
      atStartOfLine = false;
    }

    if (node.type !== 'text') {
      outputNodes.push(node);
      continue;
    }

    let match = null;
    let parseFrom = 0;

    while ((match = regexp.exec(node.data)) !== null) {
      const previousText = node.data.slice(parseFrom, match.index);

      // match.index is already relative to the start of node.data, so the
      // end of the preceding text is simply node.i + match.index.
      outputNodes.push({
        type: 'text',
        data: previousText,
        i: node.i + parseFrom,
        iEnd: node.i + match.index,
      });

      parseFrom = match.index + match[0].length;

      if (previousText.endsWith('\n')) {
        atStartOfLine = true;
      } else if (previousText.length) {
        atStartOfLine = false;
      }

      const attributes =
        html.parseAttributes(match[1]);

      const remainingTextInNode =
        node.data.slice(parseFrom);

      const inline = (() => {
        // If we've already determined we're in the middle of a line,
        // we're inline. (Of course!)
        if (!atStartOfLine) {
          return true;
        }

        // If there's more text to go in this text node, and what's
        // remaining doesn't start with a line break, we're inline.
        if (remainingTextInNode && remainingTextInNode[0] !== '\n') {
          return true;
        }

        // If we're at the end of this text node, but this text node
        // isn't the last node overall, we're inline.
        if (!remainingTextInNode && node !== lastNode) {
          return true;
        }

        // If no other condition matches, this tag is on its own line.
        return false;
      })();

      try {
        outputNodes.push(callback(attributes, {inline}));
      } catch (caughtError) {
        errors.push(new Error(
          `Failed to process ${match[0]}`,
          {cause: caughtError}));
      }

      // No longer at the start of a line after the tag - there will at
      // least be text with only '\n' before the next of this tag that's
      // on its own line.
      atStartOfLine = false;
    }

    if (parseFrom !== node.data.length) {
      outputNodes.push({
        type: 'text',
        data: node.data.slice(parseFrom),
        i: node.i + parseFrom,
        iEnd: node.iEnd,
      });
    }
  }

  if (!empty(errors)) {
    throw new AggregateError(
      errors,
      `Errors postprocessing <${tagName}> tags`);
  }

  return outputNodes;
}

// Throws if a media tag's `src` attribute is missing, or begins with
// "/media/" instead of "media/".
function complainAboutMediaSrc(src) {
  if (!src) {
    throw new Error(`Missing "src" attribute`);
  }

  if (src.startsWith('/media/')) {
    throw new Error(`Start "src" with "media/", not "/media/"`);
  }
}

// Replaces `<img ...>` tags in text nodes with {type: 'image'} nodes.
export function postprocessImages(inputNodes) {
  return postprocessHTMLTags(inputNodes, 'img',
    (attributes, {inline}) => {
      const node = {type: 'image'};

      node.src = attributes.get('src');
      complainAboutMediaSrc(node.src);

      // An explicit `inline` attribute wins over the detected placement.
      node.inline = attributes.get('inline') ?? inline;

      if (attributes.get('link')) node.link = attributes.get('link');
      if (attributes.get('style')) node.style = attributes.get('style');
      if (attributes.get('width')) node.width = Number.parseInt(attributes.get('width'), 10);
      if (attributes.get('height')) node.height = Number.parseInt(attributes.get('height'), 10);
      if (attributes.get('align')) node.align = attributes.get('align');
      if (attributes.get('pixelate')) node.pixelate = true;

      if (attributes.get('warning')) {
        node.warnings =
          attributes.get('warning').split(', ');
      }

      return node;
    });
}

// Replaces `<video ...>` tags in text nodes with {type: 'video'} nodes.
export function postprocessVideos(inputNodes) {
  return postprocessHTMLTags(inputNodes, 'video',
    (attributes, {inline}) => {
      const node = {type: 'video'};

      node.src = attributes.get('src');
      complainAboutMediaSrc(node.src);

      node.inline = attributes.get('inline') ?? inline;

      if (attributes.get('width')) node.width = Number.parseInt(attributes.get('width'), 10);
      if (attributes.get('height')) node.height = Number.parseInt(attributes.get('height'), 10);
      if (attributes.get('align')) node.align = attributes.get('align');
      if (attributes.get('pixelate')) node.pixelate = true;

      return node;
    });
}

// Replaces `<audio ...>` tags in text nodes with {type: 'audio'} nodes.
export function postprocessAudios(inputNodes) {
  return postprocessHTMLTags(inputNodes, 'audio',
    (attributes, {inline}) => {
      const node = {type: 'audio'};

      node.src = attributes.get('src');
      complainAboutMediaSrc(node.src);

      node.inline = attributes.get('inline') ?? inline;

      if (attributes.get('align')) node.align = attributes.get('align');
      if (attributes.get('nameless')) node.nameless = true;

      return node;
    });
}

// Adds the "content-heading" class to every `<h2 ...>` opening tag found in
// text nodes. Only headings which already carry at least one attribute are
// matched (the regex requires a space after "h2").
export function postprocessHeadings(inputNodes) {
  const outputNodes = [];

  for (const node of inputNodes) {
    if (node.type !== 'text') {
      outputNodes.push(node);
      continue;
    }

    const headingRegexp = /<h2 (.*?)>/g;

    let textContent = '';

    let match = null;
    let parseFrom = 0;

    while ((match = headingRegexp.exec(node.data)) !== null) {
      textContent += node.data.slice(parseFrom, match.index);
      parseFrom = match.index + match[0].length;

      const attributes = html.parseAttributes(match[1]);
      attributes.push('class', 'content-heading');

      // We're only modifying the opening tag here. The remaining content,
      // including the closing tag, will be pushed as-is.
      textContent += `<h2 ${attributes}>`;
    }

    if (parseFrom !== node.data.length) {
      textContent += node.data.slice(parseFrom);
    }

    outputNodes.push({
      type: 'text',
      data: textContent,
      i: node.i,
      iEnd: node.iEnd,
    });
  }

  return outputNodes;
}

// Wraps the contents of each single-line `<summary>...</summary>` in a
// <span>, plus a <b> if the summary doesn't already contain one.
export function postprocessSummaries(inputNodes) {
  const outputNodes = [];

  for (const node of inputNodes) {
    if (node.type !== 'text') {
      outputNodes.push(node);
      continue;
    }

    // Lazy, so that two summaries on the same line are matched one by one
    // instead of as a single span running from the first opening tag to
    // the last closing tag.
    const summaryRegexp = /<summary>(.*?)<\/summary>/g;

    let textContent = '';

    let match = null;
    let parseFrom = 0;

    while ((match = summaryRegexp.exec(node.data)) !== null) {
      textContent += node.data.slice(parseFrom, match.index);
      parseFrom = match.index + match[0].length;

      const colorizeWholeSummary = !match[1].includes('<b>');

      // We're wrapping the contents of the <summary> with a <span>, and
      // possibly with a <b>, too. This means we have to add the closing tags
      // where the summary ends.
      textContent += `<summary><span>`;
      textContent += (colorizeWholeSummary ? `<b>` : ``);
      textContent += match[1];
      textContent += (colorizeWholeSummary ? `</b>` : ``);
      textContent += `</span></summary>`;
    }

    if (parseFrom !== node.data.length) {
      textContent += node.data.slice(parseFrom);
    }

    outputNodes.push({
      type: 'text',
      data: textContent,
      i: node.i,
      iEnd: node.iEnd,
    });
  }

  return outputNodes;
}

// Splits text nodes around the links reported by matchMarkdownLinks,
// turning each one into an {type: 'external-link', data: {label, href}}
// node. Empty stretches of text between links aren't emitted.
export function postprocessExternalLinks(inputNodes) {
  const outputNodes = [];

  for (const node of inputNodes) {
    if (node.type !== 'text') {
      outputNodes.push(node);
      continue;
    }

    let textNode = {
      i: node.i,
      iEnd: null,
      type: 'text',
      data: '',
    };

    let parseFrom = 0;
    for (const match of matchMarkdownLinks(node.data, {marked})) {
      const {label, href, index, length} = match;

      textNode.data += node.data.slice(parseFrom, index);

      if (textNode.data) {
        textNode.iEnd = textNode.i + textNode.data.length;
        outputNodes.push(textNode);
      }

      outputNodes.push({
        i: node.i + index,
        iEnd: node.i + index + length,
        type: 'external-link',
        data: {label, href},
      });

      // Always start a fresh text node after the link - even if nothing
      // came before it - so that whatever text follows is positioned
      // after the link, not wherever the previous text node began.
      textNode = {
        i: node.i + index + length,
        iEnd: null,
        type: 'text',
        data: '',
      };

      parseFrom = index + length;
    }

    if (parseFrom !== node.data.length) {
      textNode.data += node.data.slice(parseFrom);
      textNode.iEnd = node.iEnd;
    }

    if (textNode.data) {
      outputNodes.push(textNode);
    }
  }

  return outputNodes;
}

// Converts a makeError-shaped parse error into a SyntaxError whose message
// quotes the offending line of input, with a caret under position `i`.
function describeParseError(input, {i, data: {message}}) {
  // lastIndexOf gives -1 when there's no earlier line break, which the
  // + 1 conveniently turns into 0, the start of the input.
  const lineStart = input.slice(0, i).lastIndexOf('\n') + 1;

  const nextLineBreak = input.indexOf('\n', i);
  const lineEnd = (nextLineBreak >= 0 ? nextLineBreak : input.length);

  const line = input.slice(lineStart, lineEnd);
  const cursor = i - lineStart;

  return new SyntaxError(
    `Parse error (at pos ${i}): ${message}\n` +
    line + `\n` +
    '-'.repeat(cursor) + '^');
}

// Parses wiki content text into nodes, then runs every postprocessing step
// over the result.
//
// errorMode 'throw' (the default): returns the nodes, or throws the error.
// errorMode 'return': returns {error, result}, with error null on success.
//   If parsing itself failed, result is a single text node holding the
//   whole input verbatim.
//
// A failing postprocess step is skipped - later steps still run on the
// output of the last step which succeeded - and all such failures are
// reported together as one AggregateError.
//
// Throws TypeError if input isn't a string, and Error for an unknown
// errorMode.
export function parseContentNodes(input, {
  errorMode = 'throw',
} = {}) {
  if (typeof input !== 'string') {
    throw new TypeError(`Expected input to be string, got ${typeAppearance(input)}`);
  }

  let result = null;
  let error = null;
  let parsed = false;

  try {
    result = parseNodes(input, 0);
    parsed = true;
  } catch (caughtError) {
    // Anything which isn't one of our own parse errors is passed
    // along as it was thrown.
    error =
      (caughtError?.type === 'error'
        ? describeParseError(input, caughtError)
        : caughtError);
  }

  // A parse error means there's no output to continue with at all,
  // so postprocessing only happens after a successful parse.
  if (parsed) {
    const postprocessErrors = [];

    for (const postprocess of [
      postprocessComments,
      postprocessImages,
      postprocessVideos,
      postprocessAudios,
      postprocessHeadings,
      postprocessSummaries,
      postprocessExternalLinks,
    ]) {
      try {
        result = postprocess(result);
      } catch (caughtError) {
        const stepError =
          new Error(
            `Error in step ${`"${postprocess.name}"`}`,
            {cause: caughtError});

        stepError[Symbol.for('hsmusic.aggregate.translucent')] = true;

        postprocessErrors.push(stepError);
      }
    }

    if (!empty(postprocessErrors)) {
      error =
        new AggregateError(
          postprocessErrors,
          `Errors postprocessing content text`);

      error[Symbol.for('hsmusic.aggregate.translucent')] = 'single';
    }
  }

  if (errorMode === 'throw') {
    if (error) {
      throw error;
    } else {
      return result;
    }
  } else if (errorMode === 'return') {
    if (!result) {
      result = [{
        i: 0,
        iEnd: input.length,
        type: 'text',
        data: input,
      }];
    }

    return {error, result};
  } else {
    throw new Error(`Unknown errorMode ${errorMode}`);
  }
}

// Yields `nodes` again, with each text node split wherever `splitter`
// matches: the pieces of text in between come out as text nodes (empty
// pieces are dropped) and each match as a {type: 'separator'} node carrying
// the matched text and the match object. Non-text nodes are yielded as-is.
//
// splitter: a generator function taking the text and yielding objects with
// {index, length}, or a regular expression (which must carry the `g` flag,
// as required by String.prototype.matchAll).
export function* splitContentNodesAround(nodes, splitter) {
  if (splitter instanceof RegExp) {
    const regex = splitter;

    splitter = function*(text) {
      for (const match of text.matchAll(regex)) {
        yield {
          index: match.index,
          length: match[0].length,
        };
      }
    };
  }

  // Whatever was passed in is a function by now, if it was valid at all.
  if (typeof splitter !== 'function') {
    throw new TypeError(`Expected generator or regular expression`);
  }

  function* splitTextNode(node) {
    let textNode = {
      i: node.i,
      iEnd: null,
      type: 'text',
      data: '',
    };

    let parseFrom = 0;
    for (const match of splitter(node.data)) {
      const {index, length} = match;

      textNode.data += node.data.slice(parseFrom, index);

      if (textNode.data) {
        textNode.iEnd = textNode.i + textNode.data.length;
        yield textNode;
      }

      yield {
        i: node.i + index,
        iEnd: node.i + index + length,
        type: 'separator',
        data: {
          text: node.data.slice(index, index + length),
          match,
        },
      };

      textNode = {
        i: node.i + index + length,
        iEnd: null,
        type: 'text',
        data: '',
      };

      parseFrom = index + length;
    }

    if (parseFrom !== node.data.length) {
      textNode.data += node.data.slice(parseFrom);
      textNode.iEnd = node.iEnd;
    }

    if (textNode.data) {
      yield textNode;
    }
  }

  for (const node of nodes) {
    if (node.type === 'text') {
      yield* splitTextNode(node);
    } else {
      yield node;
    }
  }
}