data: withSourceText: parse nodes

The node-splitting behavior is nearly identical to what postprocessExternalLinks still does, so it would be nice to factor that out, but we haven't done so yet. Some form of "parse stuff out of a comma-separated bunch of source text" is probably worth factoring out too, later.
author: (quasar) nebula <qznebula@protonmail.com> 2025-05-01 14:01:01 -0300
committer: (quasar) nebula <qznebula@protonmail.com> 2025-05-06 12:29:05 -0300
commit: 57dd7dbdafba63b6edbd330b98072f09957a5492 (patch)
tree: 529779314f34e53ba116422fe68a1fa97c5e9cc8 /src/data/composite/things/content/withSourceText.js
parent: 9f14f1dfc7aa6c00c0cfa07577208ad1bdcc62f7 (diff)
1 files changed, 139 insertions, 32 deletions
diff --git a/src/data/composite/things/content/withSourceText.js b/src/data/composite/things/content/withSourceText.js
index cfab64a8..7f03f97d 100644
--- a/src/data/composite/things/content/withSourceText.js
+++ b/src/data/composite/things/content/withSourceText.js
@@ -1,10 +1,70 @@
-import * as marked from 'marked';
-
 import {input, templateCompositeFrom} from '#composite';
-import {matchMarkdownLinks} from '#wiki-data';
+import {parseInput} from '#replacer';
 
 import {raiseOutputWithoutDependency} from '#composite/control-flow';
 
+import {
+  withLengthOfList,
+  withMappedList,
+  withNearbyItemFromList,
+  withPropertyFromObject,
+} from '#composite/data';
+
+function* splitTextNodeAroundCommas(node) {
+  let textNode = {
+    i: node.i,
+    iEnd: null,
+    type: 'text',
+    data: '',
+  };
+
+  let parseFrom = 0;
+  for (const match of node.data.matchAll(/, */g)) {
+    const {index} = match, [{length}] = match;
+
+    textNode.data += node.data.slice(parseFrom, index);
+
+    if (textNode.data) {
+      textNode.iEnd = textNode.i + textNode.data.length;
+      yield textNode;
+
+      textNode = {
+        i: node.i + index + length,
+        iEnd: null,
+        type: 'text',
+        data: '',
+      };
+    }
+
+    yield {
+      i: node.i + index,
+      iEnd: node.i + index + length,
+      type: 'comma-separator',
+    };
+
+    parseFrom = index + length;
+  }
+
+  if (parseFrom !== node.data.length) {
+    textNode.data += node.data.slice(parseFrom);
+    textNode.iEnd = node.iEnd;
+  }
+
+  if (textNode.data) {
+    yield textNode;
+  }
+}
+
+function* splitTextNodesAroundCommas(nodes) {
+  for (const node of nodes) {
+    if (node.type === 'text' && node.data.includes(',')) {
+      yield* splitTextNodeAroundCommas(node);
+    } else {
+      yield node;
+    }
+  }
+}
+
 export default templateCompositeFrom({
   annotation: `withSourceText`,
 
@@ -16,60 +76,107 @@ export default templateCompositeFrom({
       output: input.value({'#sourceText': null}),
     }),
 
+    // Get the list of nodes including custom comma-separator nodes,
+    // and do some basic processing to make details about this list
+    // available later.
+
     {
       dependencies: ['annotation'],
       compute: (continuation, {
         ['annotation']: annotation,
       }) => continuation({
-        ['#matches']:
-          Array.from(matchMarkdownLinks(annotation, {marked})),
+        ['#nodes']:
+          Array.from(
+            splitTextNodesAroundCommas(
+              parseInput(annotation))),
       }),
     },
 
-    raiseOutputWithoutDependency({
-      dependency: '#matches',
-      output: input.value({'#sourceText': null}),
-      mode: input.value('empty'),
+    withLengthOfList({
+      list: '#nodes',
+    }),
+
+    withMappedList({
+      list: '#nodes',
+      map: input.value(node => node.type === 'comma-separator'),
+    }).outputs({
+      '#mappedList': '#commaSeparatorFilter',
     }),
 
+    // Identify the first and last nodes in the range running from
+    // the first external link, up until (but not including) the following
+    // comma separator.
+
     {
-      dependencies: ['#matches'],
+      dependencies: ['#nodes'],
       compute: (continuation, {
-        ['#matches']: matches,
-      }) =>
-        continuation({
-          ['#startIndex']:
-            matches.at(0).index,
-
-          ['#endIndex']:
-            matches.at(-1).index +
-            matches.at(-1).length,
-        }),
+        ['#nodes']: nodes,
+      }) => continuation({
+        ['#firstExternalLink']:
+          nodes.find(node => node.type === 'external-link'),
+      }),
     },
 
+    raiseOutputWithoutDependency({
+      dependency: '#firstExternalLink',
+      output: input.value({'#sourceText': null}),
+    }),
+
+    withNearbyItemFromList({
+      item: '#firstExternalLink',
+      list: '#nodes',
+      offset: input.value(+1),
+
+      filter: '#commaSeparatorFilter',
+    }).outputs({
+      '#nearbyItem': '#nextCommaSeparator',
+    }),
+
     {
-      dependencies: ['annotation', '#endIndex'],
+      dependencies: [
+        '#firstExternalLink',
+        '#nextCommaSeparator',
+        '#nodes',
+      ],
+
       compute: (continuation, {
-        ['annotation']: annotation,
-        ['#endIndex']: endIndex,
+        ['#firstExternalLink']: firstExternalLink,
+        ['#nextCommaSeparator']: nextCommaSeparator,
+        ['#nodes']: nodes,
       }) => continuation({
-        ['#rest']:
-          annotation.slice(endIndex)
-            .match(/^[^,]*(?=,|$)/),
+        ['#lastNodeInRange']:
+          (nextCommaSeparator
+            ? nodes.at(nodes.indexOf(nextCommaSeparator) - 1)
+            : nodes.at(-1)),
       }),
     },
 
+    // Extract the content text covered by that range.
+
+    withPropertyFromObject({
+      object: '#firstExternalLink',
+      property: input.value('i'),
+    }),
+
+    withPropertyFromObject({
+      object: '#lastNodeInRange',
+      property: input.value('iEnd'),
+    }),
+
     {
-      dependencies: ['annotation', '#startIndex', '#endIndex', '#rest'],
+      dependencies: [
+        '#firstExternalLink.i',
+        '#lastNodeInRange.iEnd',
+        'annotation',
+      ],
+
       compute: (continuation, {
+        ['#firstExternalLink.i']: i,
+        ['#lastNodeInRange.iEnd']: iEnd,
         ['annotation']: annotation,
-        ['#startIndex']: startIndex,
-        ['#endIndex']: endIndex,
-        ['#rest']: rest,
       }) => continuation({
         ['#sourceText']:
-          annotation.slice(startIndex, startIndex + endIndex) +
-          rest,
+          annotation.slice(i, iEnd),
       }),
     },
   ],
author	(quasar) nebula <qznebula@protonmail.com>	2025-05-01 14:01:01 -0300
committer	(quasar) nebula <qznebula@protonmail.com>	2025-05-06 12:29:05 -0300
commit	57dd7dbdafba63b6edbd330b98072f09957a5492 (patch)
tree	529779314f34e53ba116422fe68a1fa97c5e9cc8 /src/data/composite/things/content/withSourceText.js
parent	9f14f1dfc7aa6c00c0cfa07577208ad1bdcc62f7 (diff)