1 files changed, 60 insertions, 53 deletions
diff --git a/src/common-util/wiki-data.js b/src/common-util/wiki-data.js
index 36392fcb..e92a80d5 100644
--- a/src/common-util/wiki-data.js
+++ b/src/common-util/wiki-data.js
@@ -66,77 +66,84 @@ export function compareKebabCase(name1, name2) {
 
 // Specific data utilities
 
-// Matches heading details from commentary data in roughly the format:
+// dateRegex matches a date written in any one of these formats:
 //
-//    <i>artistText:</i> (annotation, date)
+// * "25 December 2019" - one or two number digits, a space, any text without
+//   commas, followed by four number digits
+// * "December 25, 2019" - one all-letters word, a space, one or two number
+//   digits, a comma, and four number digits
+// * "12/25/2019" etc - three sets of one to four number digits, separated
+//   by slashes or dashes (only valid orders are MM/DD/YYYY and YYYY/MM/DD)
 //
-// where capturing group "annotation" can be any text at all, except that the
-// last entry (past a comma or the only content within parentheses), if parsed
-// as a date, is the capturing group "date". "Parsing as a date" means matching
-// one of these formats:
-//
-//   * "25 December 2019" - one or two number digits, followed by any text,
-//     followed by four number digits
-//   * "December 25, 2019" - one all-letters word, a space, one or two number
-//     digits, a comma, and four number digits
-//   * "12/25/2019" etc - three sets of one to four number digits, separated
-//     by slashes or dashes (only valid orders are MM/DD/YYYY and YYYY/MM/DD)
-//
-// Note that the annotation and date are always wrapped by one opening and one
-// closing parentheses. The whole heading does NOT need to match the entire
-// line it occupies (though it does always start at the first position on that
-// line), and if there is more than one closing parenthesis on the line, the
-// annotation will always cut off only at the last parenthesis, or a comma
-// preceding a date and then the last parenthesis. This is to ensure that
-// parentheses can be part of the actual annotation content.
-//
-// Capturing group "artistReference" is all the characters between <i> and </i>
-// (apart from the pipe and the "artistText" group, if present), and is either
-// the name of one or more artist or "artist:directory"-style references,
-// joined by commas, if multiple.
-//
-// This regular expression *doesn't* match bodies, which will need to be parsed
-// out of the original string based on the indices matched using this.
-//
-
 const dateRegex = groupName =>
-  String.raw`(?<${groupName}>[a-zA-Z]+ [0-9]{1,2}, [0-9]{4,4}|[0-9]{1,2} [^,]*[0-9]{4,4}|[0-9]{1,4}[-/][0-9]{1,4}[-/][0-9]{1,4})`;
+  String.raw`(?<${groupName}>` +
+    String.raw`[a-zA-Z]+ [0-9]{1,2}, [0-9]{4,4}|` +
+    String.raw`[0-9]{1,2} [^,]*[0-9]{4,4}|` +
+    String.raw`[0-9]{1,4}[-/][0-9]{1,4}[-/][0-9]{1,4}` +
+  String.raw`)`;
+
+const contentEntryHeadingRegexRaw =
+  String.raw`^<i>(?<artists>.+?):<\/i>(?: \((?<annotation>.*)\))?$`;
+
+const contentEntryHeadingRegex =
+  new RegExp(contentEntryHeadingRegexRaw, 'gm');
+
+const contentEntryAnnotationTailRegexRaw =
+  String.raw`(?:, |^)` +
+
+  String.raw`(?:(?<dateKind>sometime|throughout|around) )?` +
+  String.raw`${dateRegex('date')}` +
+  String.raw`(?: ?- ?${dateRegex('secondDate')})?` +
 
-const commentaryRegexRaw =
-  String.raw`^<i>(?<artistText>.+?):<\/i>(?: \((?<annotation>(?:.*?(?=,|\)[^)]*$))*?)(?:,? ?(?:(?<dateKind>sometime|throughout|around) )?${dateRegex('date')}(?: ?- ?${dateRegex('secondDate')})?(?: (?<accessKind>captured|accessed) ${dateRegex('accessDate')})?)?\))?`;
-const commentaryRegex =
-  new RegExp(commentaryRegexRaw, 'gm');
+  String.raw`(?: ?(?<= )` +
+    String.raw`(?<accessKind>captured|accessed) ${dateRegex('accessDate')}` +
+  String.raw`)?` +
 
-export function matchContentEntries(sourceText) {
-  const matchEntries = [];
+  String.raw`$`;
 
-  let previousMatchEntry = null;
-  let previousEndIndex = null;
+const contentEntryAnnotationTailRegex =
+  new RegExp(contentEntryAnnotationTailRegexRaw);
+
+export function* matchContentEntries(sourceText) {
+  let workingEntry = null;
+  let workingBodyIndex = null;
 
   const trimBody = body =>
     body
       .replace(/^\n*/, '')
       .replace(/\n*$/, '');
 
-  for (const {0: matchText, index: startIndex, groups: matchEntry}
-          of sourceText.matchAll(commentaryRegex)) {
-    if (previousMatchEntry) {
-      previousMatchEntry.body =
-        trimBody(sourceText.slice(previousEndIndex, startIndex));
+  for (const headingMatch of sourceText.matchAll(contentEntryHeadingRegex)) {
+    if (workingEntry) {
+      workingEntry.body =
+        trimBody(sourceText.slice(workingBodyIndex, headingMatch.index));
+
+      yield workingEntry;
     }
 
-    matchEntries.push(matchEntry);
+    workingEntry = {...headingMatch.groups};
 
-    previousMatchEntry = matchEntry;
-    previousEndIndex = startIndex + matchText.length;
-  }
+    if (workingEntry.annotation) {
+      const annotationTailMatch =
+        workingEntry.annotation.match(contentEntryAnnotationTailRegex);
 
-  if (previousMatchEntry) {
-    previousMatchEntry.body =
-      trimBody(sourceText.slice(previousEndIndex));
+      if (annotationTailMatch) {
+        workingEntry.annotation =
+          workingEntry.annotation.slice(0, annotationTailMatch.index);
+
+        Object.assign(workingEntry, annotationTailMatch.groups);
+      }
+    }
+
+    workingBodyIndex = headingMatch.index + headingMatch[0].length;
   }
 
-  return matchEntries;
+  if (workingEntry) {
+    workingEntry.body =
+      trimBody(sourceText.slice(workingBodyIndex));
+
+    yield workingEntry;
+  }
 }
 
 export function filterAlbumsByCommentary(albums) {