wiki-data: uber-simplify matchContentEntries

author: (quasar) nebula <qznebula@protonmail.com> 2026-01-31 17:04:16 -0400
committer: (quasar) nebula <qznebula@protonmail.com> 2026-01-31 17:04:16 -0400
commit: b45c33241cbb559e493fb19a8e775326d69b3d6f (patch)
tree: 0f6a3c7dc6e70e163605aa321c4d3e7cb6615cbe /src
parent: 6c32e484a4ad6be5690c1083b9131d0338e3c5b9 (diff)
2 files changed, 66 insertions, 59 deletions
diff --git a/src/common-util/wiki-data.js b/src/common-util/wiki-data.js
index 36392fcb..e92a80d5 100644
--- a/src/common-util/wiki-data.js
+++ b/src/common-util/wiki-data.js
@@ -66,77 +66,84 @@ export function compareKebabCase(name1, name2) {
 
 // Specific data utilities
 
-// Matches heading details from commentary data in roughly the format:
+// Formats that dateRegex (below) is intended to match; treat as best-effort.
 //
-//    <i>artistText:</i> (annotation, date)
+// * "25 December 2019" - one or two number digits, a space, any text
+//   containing no commas, followed by four number digits
+// * "December 25, 2019" - one all-letters word, a space, one or two number
+//   digits, a comma, and four number digits
+// * "12/25/2019" etc - three sets of one to four number digits, separated
+//   by slashes or dashes (only valid orders are MM/DD/YYYY and YYYY/MM/DD)
 //
-// where capturing group "annotation" can be any text at all, except that the
-// last entry (past a comma or the only content within parentheses), if parsed
-// as a date, is the capturing group "date". "Parsing as a date" means matching
-// one of these formats:
-//
-//   * "25 December 2019" - one or two number digits, followed by any text,
-//     followed by four number digits
-//   * "December 25, 2019" - one all-letters word, a space, one or two number
-//     digits, a comma, and four number digits
-//   * "12/25/2019" etc - three sets of one to four number digits, separated
-//     by slashes or dashes (only valid orders are MM/DD/YYYY and YYYY/MM/DD)
-//
-// Note that the annotation and date are always wrapped by one opening and one
-// closing parentheses. The whole heading does NOT need to match the entire
-// line it occupies (though it does always start at the first position on that
-// line), and if there is more than one closing parenthesis on the line, the
-// annotation will always cut off only at the last parenthesis, or a comma
-// preceding a date and then the last parenthesis. This is to ensure that
-// parentheses can be part of the actual annotation content.
-//
-// Capturing group "artistReference" is all the characters between <i> and </i>
-// (apart from the pipe and the "artistText" group, if present), and is either
-// the name of one or more artist or "artist:directory"-style references,
-// joined by commas, if multiple.
-//
-// This regular expression *doesn't* match bodies, which will need to be parsed
-// out of the original string based on the indices matched using this.
-//
-
 const dateRegex = groupName =>
-  String.raw`(?<${groupName}>[a-zA-Z]+ [0-9]{1,2}, [0-9]{4,4}|[0-9]{1,2} [^,]*[0-9]{4,4}|[0-9]{1,4}[-/][0-9]{1,4}[-/][0-9]{1,4})`;
+  String.raw`(?<${groupName}>` +
+    String.raw`[a-zA-Z]+ [0-9]{1,2}, [0-9]{4,4}|` +
+    String.raw`[0-9]{1,2} [^,]*[0-9]{4,4}|` +
+    String.raw`[0-9]{1,4}[-/][0-9]{1,4}[-/][0-9]{1,4}` +
+  String.raw`)`;
+
+const contentEntryHeadingRegexRaw =
+  String.raw`^<i>(?<artists>.+?):<\/i>(?: \((?<annotation>.*)\))?$`;
+
+const contentEntryHeadingRegex =
+  new RegExp(contentEntryHeadingRegexRaw, 'gm');
+
+const contentEntryAnnotationTailRegexRaw =
+  String.raw`(?:, |^)` +
+
+  String.raw`(?:(?<dateKind>sometime|throughout|around) )?` +
+  String.raw`${dateRegex('date')}` +
+  String.raw`(?: ?- ?${dateRegex('secondDate')})?` +
 
-const commentaryRegexRaw =
-  String.raw`^<i>(?<artistText>.+?):<\/i>(?: \((?<annotation>(?:.*?(?=,|\)[^)]*$))*?)(?:,? ?(?:(?<dateKind>sometime|throughout|around) )?${dateRegex('date')}(?: ?- ?${dateRegex('secondDate')})?(?: (?<accessKind>captured|accessed) ${dateRegex('accessDate')})?)?\))?`;
-const commentaryRegex =
-  new RegExp(commentaryRegexRaw, 'gm');
+  String.raw`(?: ?(?<= )` +
+    String.raw`(?<accessKind>captured|accessed) ${dateRegex('accessDate')}` +
+  String.raw`)?` +
 
-export function matchContentEntries(sourceText) {
-  const matchEntries = [];
+  String.raw`$`;
 
-  let previousMatchEntry = null;
-  let previousEndIndex = null;
+const contentEntryAnnotationTailRegex =
+  new RegExp(contentEntryAnnotationTailRegexRaw);
+
+export function* matchContentEntries(sourceText) {
+  let workingEntry = null;
+  let workingBodyIndex = null;
 
   const trimBody = body =>
     body
       .replace(/^\n*/, '')
       .replace(/\n*$/, '');
 
-  for (const {0: matchText, index: startIndex, groups: matchEntry}
-          of sourceText.matchAll(commentaryRegex)) {
-    if (previousMatchEntry) {
-      previousMatchEntry.body =
-        trimBody(sourceText.slice(previousEndIndex, startIndex));
+  for (const headingMatch of sourceText.matchAll(contentEntryHeadingRegex)) {
+    if (workingEntry) {
+      workingEntry.body =
+        trimBody(sourceText.slice(workingBodyIndex, headingMatch.index));
+
+      yield workingEntry;
     }
 
-    matchEntries.push(matchEntry);
+    workingEntry = {...headingMatch.groups};
 
-    previousMatchEntry = matchEntry;
-    previousEndIndex = startIndex + matchText.length;
-  }
+    if (workingEntry.annotation) {
+      const annotationTailMatch =
+        workingEntry.annotation.match(contentEntryAnnotationTailRegex);
 
-  if (previousMatchEntry) {
-    previousMatchEntry.body =
-      trimBody(sourceText.slice(previousEndIndex));
+      if (annotationTailMatch) {
+        workingEntry.annotation =
+          workingEntry.annotation.slice(0, annotationTailMatch.index);
+
+        Object.assign(workingEntry, annotationTailMatch.groups);
+      }
+    }
+
+    workingBodyIndex = headingMatch.index + headingMatch[0].length;
   }
 
-  return matchEntries;
+  if (workingEntry) {
+    workingEntry.body =
+      trimBody(sourceText.slice(workingBodyIndex));
+
+    yield workingEntry;
+  }
 }
 
 export function filterAlbumsByCommentary(albums) {
diff --git a/src/data/yaml.js b/src/data/yaml.js
index 0a7fce93..85c05b93 100644
--- a/src/data/yaml.js
+++ b/src/data/yaml.js
@@ -945,7 +945,7 @@ export function parseContentEntriesFromSourceText(thingClass, sourceText, {subdo
     const artistTextNodes =
       Array.from(
         splitContentNodesAround(
-          parseContentNodes(matchEntry.artistText),
+          parseContentNodes(matchEntry.artists),
           /\|/g));
 
     const separatorIndices =
@@ -955,9 +955,9 @@ export function parseContentEntriesFromSourceText(thingClass, sourceText, {subdo
 
     if (empty(separatorIndices)) {
       if (artistTextNodes.length === 1 && artistTextNodes[0].type === 'text') {
-        artistReferences = matchEntry.artistText;
+        artistReferences = matchEntry.artists;
       } else {
-        artistText = matchEntry.artistText;
+        artistText = matchEntry.artists;
       }
     } else {
       const firstSeparatorIndex =
@@ -968,12 +968,12 @@ export function parseContentEntriesFromSourceText(thingClass, sourceText, {subdo
         artistTextNodes.length;
 
       artistReferences =
-        matchEntry.artistText.slice(
+        matchEntry.artists.slice(
           artistTextNodes.at(0).i,
           artistTextNodes.at(firstSeparatorIndex - 1).iEnd);
 
       artistText =
-        matchEntry.artistText.slice(
+        matchEntry.artists.slice(
           artistTextNodes.at(firstSeparatorIndex).iEnd,
           artistTextNodes.at(secondSeparatorIndex - 1).iEnd);
     }
@@ -1016,7 +1016,7 @@ export function parseContentEntriesFromSourceText(thingClass, sourceText, {subdo
   }
 
   const documents =
-    matchContentEntries(sourceText)
+    Array.from(matchContentEntries(sourceText))
       .map(matchEntry =>
         withEntries(
           map(matchEntry),
author	(quasar) nebula <qznebula@protonmail.com>	2026-01-31 17:04:16 -0400
committer	(quasar) nebula <qznebula@protonmail.com>	2026-01-31 17:04:16 -0400
commit	b45c33241cbb559e493fb19a8e775326d69b3d6f (patch)
tree	0f6a3c7dc6e70e163605aa321c4d3e7cb6615cbe /src
parent	6c32e484a4ad6be5690c1083b9131d0338e3c5b9 (diff)