From 57dd7dbdafba63b6edbd330b98072f09957a5492 Mon Sep 17 00:00:00 2001
From: "(quasar) nebula" <qznebula@protonmail.com>
Date: Thu, 1 May 2025 14:01:01 -0300
Subject: data: withSourceText: parse nodes

The node-splitting behavior is itself nearly identical to what
postprocessExternalLinks still does, so it would be nice to
factor that out, but we haven't done so yet.

Some degree of "parse stuff out of a comma-divided bunch of
source text" is probably worth factoring out too, later.
---
 .../composite/things/content/withSourceText.js     | 171 +++++++++++++++++----
 1 file changed, 139 insertions(+), 32 deletions(-)

(limited to 'src/data')

diff --git a/src/data/composite/things/content/withSourceText.js b/src/data/composite/things/content/withSourceText.js
index cfab64a8..7f03f97d 100644
--- a/src/data/composite/things/content/withSourceText.js
+++ b/src/data/composite/things/content/withSourceText.js
@@ -1,10 +1,70 @@
-import * as marked from 'marked';
-
 import {input, templateCompositeFrom} from '#composite';
-import {matchMarkdownLinks} from '#wiki-data';
+import {parseInput} from '#replacer';
 
 import {raiseOutputWithoutDependency} from '#composite/control-flow';
 
+import {
+  withLengthOfList,
+  withMappedList,
+  withNearbyItemFromList,
+  withPropertyFromObject,
+} from '#composite/data';
+// Generator: splits one text node into text and 'comma-separator' nodes at each comma (plus any spaces after it).
+function* splitTextNodeAroundCommas(node) {
+  let textNode = {
+    i: node.i,
+    iEnd: null,
+    type: 'text',
+    data: '',
+  };
+  // `parseFrom` is the offset into node.data just past the previous separator (0 before the first one).
+  let parseFrom = 0;
+  for (const match of node.data.matchAll(/, */g)) {
+    const {index} = match, [{length}] = match;
+    // Collect the text lying between the previous separator and this one; empty pieces are never yielded.
+    textNode.data += node.data.slice(parseFrom, index);
+
+    if (textNode.data) {
+      textNode.iEnd = textNode.i + textNode.data.length;
+      yield textNode;
+      // Begin a fresh pending text node positioned just past this separator.
+      textNode = {
+        i: node.i + index + length,
+        iEnd: null,
+        type: 'text',
+        data: '',
+      };
+    }
+    // NOTE(review): if no text precedes this comma, textNode is not replaced above, so its `i` stays stale.
+    yield {
+      i: node.i + index,
+      iEnd: node.i + index + length,
+      type: 'comma-separator',
+    };
+
+    parseFrom = index + length;
+  }
+  // Text after the final separator (or the whole node, if no comma matched) ends where the input node ends.
+  if (parseFrom !== node.data.length) {
+    textNode.data += node.data.slice(parseFrom);
+    textNode.iEnd = node.iEnd;
+  }
+
+  if (textNode.data) {
+    yield textNode;
+  }
+}
+// Generator: passes nodes through unchanged, except text nodes containing ',' which are expanded by splitTextNodeAroundCommas.
+function* splitTextNodesAroundCommas(nodes) {
+  for (const node of nodes) {
+    if (node.type === 'text' && node.data.includes(',')) {
+      yield* splitTextNodeAroundCommas(node);
+    } else {
+      yield node;
+    }
+  }
+}
+
 export default templateCompositeFrom({
   annotation: `withSourceText`,
 
@@ -16,60 +76,107 @@ export default templateCompositeFrom({
       output: input.value({'#sourceText': null}),
     }),
 
+    // Get the list of nodes including custom comma-separator nodes,
+    // and do some basic processing to make details about this list
+    // available later.
+
     {
       dependencies: ['annotation'],
       compute: (continuation, {
         ['annotation']: annotation,
       }) => continuation({
-        ['#matches']:
-          Array.from(matchMarkdownLinks(annotation, {marked})),
+        ['#nodes']:
+          Array.from(
+            splitTextNodesAroundCommas(
+              parseInput(annotation))),
       }),
     },
 
-    raiseOutputWithoutDependency({
-      dependency: '#matches',
-      output: input.value({'#sourceText': null}),
-      mode: input.value('empty'),
+    withLengthOfList({
+      list: '#nodes',
+    }),
+
+    withMappedList({
+      list: '#nodes',
+      map: input.value(node => node.type === 'comma-separator'),
+    }).outputs({
+      '#mappedList': '#commaSeparatorFilter',
     }),
 
+    // Identify the first and last nodes in the range running from
+    // the first external link, up until (but not including) the following
+    // comma separator.
+
     {
-      dependencies: ['#matches'],
+      dependencies: ['#nodes'],
       compute: (continuation, {
-        ['#matches']: matches,
-      }) =>
-        continuation({
-          ['#startIndex']:
-            matches.at(0).index,
-
-          ['#endIndex']:
-            matches.at(-1).index +
-            matches.at(-1).length,
-        }),
+        ['#nodes']: nodes,
+      }) => continuation({
+        ['#firstExternalLink']:
+          nodes.find(node => node.type === 'external-link'),
+      }),
     },
 
+    raiseOutputWithoutDependency({
+      dependency: '#firstExternalLink',
+      output: input.value({'#sourceText': null}),
+    }),
+
+    withNearbyItemFromList({
+      item: '#firstExternalLink',
+      list: '#nodes',
+      offset: input.value(+1),
+
+      filter: '#commaSeparatorFilter',
+    }).outputs({
+      '#nearbyItem': '#nextCommaSeparator',
+    }),
+
     {
-      dependencies: ['annotation', '#endIndex'],
+      dependencies: [
+        '#firstExternalLink',
+        '#nextCommaSeparator',
+        '#nodes',
+      ],
+
       compute: (continuation, {
-        ['annotation']: annotation,
-        ['#endIndex']: endIndex,
+        ['#firstExternalLink']: firstExternalLink,
+        ['#nextCommaSeparator']: nextCommaSeparator,
+        ['#nodes']: nodes,
       }) => continuation({
-        ['#rest']:
-          annotation.slice(endIndex)
-            .match(/^[^,]*(?=,|$)/),
+        ['#lastNodeInRange']:
+          (nextCommaSeparator
+            ? nodes.at(nodes.indexOf(nextCommaSeparator) - 1)
+            : nodes.at(-1)),
       }),
     },
 
+    // Extract the content text covered by that range.
+
+    withPropertyFromObject({
+      object: '#firstExternalLink',
+      property: input.value('i'),
+    }),
+
+    withPropertyFromObject({
+      object: '#lastNodeInRange',
+      property: input.value('iEnd'),
+    }),
+
     {
-      dependencies: ['annotation', '#startIndex', '#endIndex', '#rest'],
+      dependencies: [
+        '#firstExternalLink.i',
+        '#lastNodeInRange.iEnd',
+        'annotation',
+      ],
+
       compute: (continuation, {
+        ['#firstExternalLink.i']: i,
+        ['#lastNodeInRange.iEnd']: iEnd,
         ['annotation']: annotation,
-        ['#startIndex']: startIndex,
-        ['#endIndex']: endIndex,
-        ['#rest']: rest,
       }) => continuation({
         ['#sourceText']:
-          annotation.slice(startIndex, startIndex + endIndex) +
-          rest,
+          annotation.slice(i, iEnd),
       }),
     },
   ],
-- 
cgit 1.3.0-6-gf8a5