hsmusic-wiki - HSMusic - static wiki software cataloguing collaborative creation
Diffstat (limited to 'src/common-util/wiki-data.js')
-rw-r--r--  src/common-util/wiki-data.js  76
1 file changed, 57 insertions, 19 deletions
diff --git a/src/common-util/wiki-data.js b/src/common-util/wiki-data.js
index 546f1ad9..1668f110 100644
--- a/src/common-util/wiki-data.js
+++ b/src/common-util/wiki-data.js
@@ -11,7 +11,7 @@ export {filterMultipleArrays} from './sugar.js';
 
 // Generic value operations
 
-export function getKebabCase(name) {
+export function getCaseSensitiveKebabCase(name) {
   return name
 
     // Spaces to dashes
@@ -34,6 +34,9 @@ export function getKebabCase(name) {
     // General punctuation which always separates surrounding words
     .replace(/[/@#$%*()_=,[\]{}|\\;:<>?`~]/g, '-')
 
+    // More punctuation which always separates surrounding words
+    .replace(/[\u{2013}-\u{2014}]/gu, '-') // En Dash, Em Dash
+
     // Accented characters
     .replace(/[áâäàå]/gi, 'a')
     .replace(/[çč]/gi, 'c')
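
A quick illustration of the added dash handling in isolation, using a made-up title; the surrounding spaces become dashes via the earlier spaces-to-dashes step, not this one:

    'Act 6 – Act 3'.replace(/[\u{2013}-\u{2014}]/gu, '-')
    // => 'Act 6 - Act 3'
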
@@ -50,9 +53,10 @@ export function getKebabCase(name) {
 
     // Trim dashes on boundaries
     .replace(/^-+|-+$/g, '')
+}
 
-    // Always lowercase
-    .toLowerCase();
+export function getKebabCase(name) {
+  return getCaseSensitiveKebabCase(name).toLowerCase();
 }
 
 // Specific data utilities
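
With the lowercasing split out, everything up to that final step is now usable on its own. A rough sketch of the resulting behavior, with a made-up name, relying only on the steps visible in this diff plus the assumption that spaces map straight to dashes (per the "Spaces to dashes" comment above):

    getCaseSensitiveKebabCase('Sburban Jungle')  // => 'Sburban-Jungle'
    getKebabCase('Sburban Jungle')               // => 'sburban-jungle'
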
@@ -102,6 +106,8 @@ export const commentaryRegexCaseSensitive =
 export const commentaryRegexCaseSensitiveOneShot =
   new RegExp(commentaryRegexRaw);
 
+export const languageOptionRegex = /{(?<name>[A-Z0-9_]+)}/g;
+
 // The #validators function isOldStyleLyrics() describes
 // what this regular expression detects against.
 export const multipleLyricsDetectionRegex =
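
The new languageOptionRegex matches {NAME}-style placeholders and captures the name. Its call sites aren't part of this diff, but usage would presumably look something like this (the template string here is made up):

    const template = 'Released {DATE} on {ALBUM}.';
    const optionNames =
      Array.from(template.matchAll(languageOptionRegex))
        .map(match => match.groups.name);
    // optionNames => ['DATE', 'ALBUM']
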
@@ -113,10 +119,16 @@ export function matchContentEntries(sourceText) {
   let previousMatchEntry = null;
   let previousEndIndex = null;
 
+  const trimBody = body =>
+    body
+      .replace(/^\n*/, '')
+      .replace(/\n*$/, '');
+
   for (const {0: matchText, index: startIndex, groups: matchEntry}
           of sourceText.matchAll(commentaryRegexCaseSensitive)) {
     if (previousMatchEntry) {
-      previousMatchEntry.body = sourceText.slice(previousEndIndex, startIndex);
+      previousMatchEntry.body =
+        trimBody(sourceText.slice(previousEndIndex, startIndex));
     }
 
     matchEntries.push(matchEntry);
@@ -126,7 +138,8 @@ export function matchContentEntries(sourceText) {
   }
 
   if (previousMatchEntry) {
-    previousMatchEntry.body = sourceText.slice(previousEndIndex);
+    previousMatchEntry.body =
+      trimBody(sourceText.slice(previousEndIndex));
   }
 
   return matchEntries;
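
The effect of the new trimBody helper is just to strip leading and trailing newlines from each entry body; blank lines inside the body are untouched. In isolation (made-up body text):

    const trimBody = body =>
      body
        .replace(/^\n*/, '')
        .replace(/\n*$/, '');

    trimBody('\n\nWhat a track.\n')     // => 'What a track.'
    trimBody('Line one.\n\nLine two.')  // => 'Line one.\n\nLine two.'
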
@@ -526,26 +539,51 @@ export function combineWikiDataArrays(arrays) {
 // Markdown stuff
 
 export function* matchMarkdownLinks(markdownSource, {marked}) {
-  const plausibleLinkRegexp = /\[.*?\)/g;
+  const plausibleLinkRegexp = /\[(?=.*?\))/g;
+
+  const lexer = new marked.Lexer();
+
+  // This is just an optimization. Don't let Marked try to process tokens
+  // recursively, i.e. within the text/label of the link. We only care about
+  // the text itself, as a string.
+  lexer.inlineTokens = x => [];
+
+  // This is cheating, because the lexer's tokenizer is a private property,
+  // but we can apparently access it anyway.
+  const {tokenizer} = lexer;
 
   let plausibleMatch = null;
   while (plausibleMatch = plausibleLinkRegexp.exec(markdownSource)) {
-    // Pedantic rules use more particular parentheses detection in link
-    // destinations - they allow one level of balanced parentheses, and
-    // otherwise, parentheses must be escaped. This allows for entire links
-    // to be wrapped in parentheses, e.g below:
-    //
-    //   This is so cool. ([You know??](https://example.com))
-    //
+    const {index} = plausibleMatch;
+
     const definiteMatch =
-      marked.Lexer.rules.inline.pedantic.link
-        .exec(markdownSource.slice(plausibleMatch.index));
+      tokenizer.link(markdownSource.slice(index));
+
+    if (!definiteMatch) {
+      continue;
+    }
 
-    if (definiteMatch) {
-      const [{length}, label, href] = definiteMatch;
-      const index = plausibleMatch.index + definiteMatch.index;
+    const {raw: {length}, text: label, href} = definiteMatch;
 
-      yield {label, href, index, length};
+    yield {label, href, index, length};
+  }
+}
+
+export function* matchInlineLinks(source) {
+  const plausibleLinkRegexp = /\b[a-z]*:\/\/[^ ]*?(?=(?:[,.!?]*)(?:\s|$))/gm;
+
+  let plausibleMatch = null;
+  while (plausibleMatch = plausibleLinkRegexp.exec(source)) {
+    const [href] = plausibleMatch;
+    const {index} = plausibleMatch;
+    const [{length}] = plausibleMatch;
+
+    try {
+      new URL(href);
+    } catch {
+      continue;
     }
+
+    yield {href, length, index};
   }
 }
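
For reference, the rewritten matchMarkdownLinks constructs a Lexer from the marked instance it's given and leans on its tokenizer rather than the pedantic link rule. The new matchInlineLinks generator is self-contained: the regex stops each match before trailing punctuation, and the URL constructor then rejects anything that isn't actually parseable. A rough usage sketch (the source text is made up; the indices are what the regex above should yield for it):

    const source = 'See https://hsmusic.wiki/ for more, or https://example.com.';

    for (const {href, index, length} of matchInlineLinks(source)) {
      console.log(href, index, length);
    }
    // https://hsmusic.wiki/ 4 21
    // https://example.com 39 19
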