diff options
| author | (quasar) nebula <qznebula@protonmail.com> | 2026-01-31 17:04:16 -0400 |
|---|---|---|
| committer | (quasar) nebula <qznebula@protonmail.com> | 2026-01-31 17:04:16 -0400 |
| commit | b45c33241cbb559e493fb19a8e775326d69b3d6f (patch) | |
| tree | 0f6a3c7dc6e70e163605aa321c4d3e7cb6615cbe /src | |
| parent | 6c32e484a4ad6be5690c1083b9131d0338e3c5b9 (diff) | |
wiki-data: uber-simplify matchContentEntries
Diffstat (limited to 'src')
| -rw-r--r-- | src/common-util/wiki-data.js | 113 | ||||
| -rw-r--r-- | src/data/yaml.js | 12 |
2 files changed, 66 insertions, 59 deletions
diff --git a/src/common-util/wiki-data.js b/src/common-util/wiki-data.js index 36392fcb..e92a80d5 100644 --- a/src/common-util/wiki-data.js +++ b/src/common-util/wiki-data.js @@ -66,77 +66,84 @@ export function compareKebabCase(name1, name2) { // Specific data utilities -// Matches heading details from commentary data in roughly the format: +// Ostensibly supports these strings. Maybe. // -// <i>artistText:</i> (annotation, date) +// * "25 December 2019" - one or two number digits, followed by any text, +// followed by four number digits +// * "December 25, 2019" - one all-letters word, a space, one or two number +// digits, a comma, and four number digits +// * "12/25/2019" etc - three sets of one to four number digits, separated +// by slashes or dashes (only valid orders are MM/DD/YYYY and YYYY/MM/DD) // -// where capturing group "annotation" can be any text at all, except that the -// last entry (past a comma or the only content within parentheses), if parsed -// as a date, is the capturing group "date". "Parsing as a date" means matching -// one of these formats: -// -// * "25 December 2019" - one or two number digits, followed by any text, -// followed by four number digits -// * "December 25, 2019" - one all-letters word, a space, one or two number -// digits, a comma, and four number digits -// * "12/25/2019" etc - three sets of one to four number digits, separated -// by slashes or dashes (only valid orders are MM/DD/YYYY and YYYY/MM/DD) -// -// Note that the annotation and date are always wrapped by one opening and one -// closing parentheses. The whole heading does NOT need to match the entire -// line it occupies (though it does always start at the first position on that -// line), and if there is more than one closing parenthesis on the line, the -// annotation will always cut off only at the last parenthesis, or a comma -// preceding a date and then the last parenthesis. This is to ensure that -// parentheses can be part of the actual annotation content. -// -// Capturing group "artistReference" is all the characters between <i> and </i> -// (apart from the pipe and the "artistText" group, if present), and is either -// the name of one or more artist or "artist:directory"-style references, -// joined by commas, if multiple. -// -// This regular expression *doesn't* match bodies, which will need to be parsed -// out of the original string based on the indices matched using this. -// - const dateRegex = groupName => - String.raw`(?<${groupName}>[a-zA-Z]+ [0-9]{1,2}, [0-9]{4,4}|[0-9]{1,2} [^,]*[0-9]{4,4}|[0-9]{1,4}[-/][0-9]{1,4}[-/][0-9]{1,4})`; + String.raw`(?<${groupName}>` + + String.raw`[a-zA-Z]+ [0-9]{1,2}, [0-9]{4,4}|` + + String.raw`[0-9]{1,2} [^,]*[0-9]{4,4}|` + + String.raw`[0-9]{1,4}[-/][0-9]{1,4}[-/][0-9]{1,4}` + + String.raw`)`; + +const contentEntryHeadingRegexRaw = + String.raw`^<i>(?<artists>.+?):<\/i>(?: \((?<annotation>.*)\))?$`; + +const contentEntryHeadingRegex = + new RegExp(contentEntryHeadingRegexRaw, 'gm'); + +const contentEntryAnnotationTailRegexRaw = + String.raw`(?:, |^)` + + + String.raw`(?:(?<dateKind>sometime|throughout|around) )?` + + String.raw`${dateRegex('date')}` + + String.raw`(?: ?- ?${dateRegex('secondDate')})?` + -const commentaryRegexRaw = - String.raw`^<i>(?<artistText>.+?):<\/i>(?: \((?<annotation>(?:.*?(?=,|\)[^)]*$))*?)(?:,? ?(?:(?<dateKind>sometime|throughout|around) )?${dateRegex('date')}(?: ?- ?${dateRegex('secondDate')})?(?: (?<accessKind>captured|accessed) ${dateRegex('accessDate')})?)?\))?`; -const commentaryRegex = - new RegExp(commentaryRegexRaw, 'gm'); + String.raw`(?: ?(?<= )` + + String.raw`(?<accessKind>captured|accessed) ${dateRegex('accessDate')}` + + String.raw`)?` + -export function matchContentEntries(sourceText) { - const matchEntries = []; + String.raw`$`; - let previousMatchEntry = null; - let previousEndIndex = null; +const contentEntryAnnotationTailRegex = + new RegExp(contentEntryAnnotationTailRegexRaw); + +export function* matchContentEntries(sourceText) { + let workingEntry = null; + let workingBodyIndex = null; const trimBody = body => body .replace(/^\n*/, '') .replace(/\n*$/, ''); - for (const {0: matchText, index: startIndex, groups: matchEntry} - of sourceText.matchAll(commentaryRegex)) { - if (previousMatchEntry) { - previousMatchEntry.body = - trimBody(sourceText.slice(previousEndIndex, startIndex)); + for (const headingMatch of sourceText.matchAll(contentEntryHeadingRegex)) { + if (workingEntry) { + workingEntry.body = + trimBody(sourceText.slice(workingBodyIndex, headingMatch.index)); + + yield workingEntry; } - matchEntries.push(matchEntry); + workingEntry = {...headingMatch.groups}; - previousMatchEntry = matchEntry; - previousEndIndex = startIndex + matchText.length; - } + if (workingEntry.annotation) { + const annotationTailMatch = + workingEntry.annotation.match(contentEntryAnnotationTailRegex); - if (previousMatchEntry) { - previousMatchEntry.body = - trimBody(sourceText.slice(previousEndIndex)); + if (annotationTailMatch) { + workingEntry.annotation = + workingEntry.annotation.slice(0, annotationTailMatch.index); + + Object.assign(workingEntry, annotationTailMatch.groups); + } + } + + workingBodyIndex = headingMatch.index + headingMatch[0].length; } - return matchEntries; + if (workingEntry) { + workingEntry.body = + trimBody(sourceText.slice(workingBodyIndex)); + + yield workingEntry; + } } export function filterAlbumsByCommentary(albums) { diff --git a/src/data/yaml.js b/src/data/yaml.js index 0a7fce93..85c05b93 100644 --- a/src/data/yaml.js +++ b/src/data/yaml.js @@ -945,7 +945,7 @@ export function parseContentEntriesFromSourceText(thingClass, sourceText, {subdo const artistTextNodes = Array.from( splitContentNodesAround( - parseContentNodes(matchEntry.artistText), + parseContentNodes(matchEntry.artists), /\|/g)); const separatorIndices = @@ -955,9 +955,9 @@ export function parseContentEntriesFromSourceText(thingClass, sourceText, {subdo if (empty(separatorIndices)) { if (artistTextNodes.length === 1 && artistTextNodes[0].type === 'text') { - artistReferences = matchEntry.artistText; + artistReferences = matchEntry.artists; } else { - artistText = matchEntry.artistText; + artistText = matchEntry.artists; } } else { const firstSeparatorIndex = @@ -968,12 +968,12 @@ export function parseContentEntriesFromSourceText(thingClass, sourceText, {subdo artistTextNodes.length; artistReferences = - matchEntry.artistText.slice( + matchEntry.artists.slice( artistTextNodes.at(0).i, artistTextNodes.at(firstSeparatorIndex - 1).iEnd); artistText = - matchEntry.artistText.slice( + matchEntry.artists.slice( artistTextNodes.at(firstSeparatorIndex).iEnd, artistTextNodes.at(secondSeparatorIndex - 1).iEnd); } @@ -1016,7 +1016,7 @@ export function parseContentEntriesFromSourceText(thingClass, sourceText, {subdo } const documents = - matchContentEntries(sourceText) + Array.from(matchContentEntries(sourceText)) .map(matchEntry => withEntries( map(matchEntry), |