diff options
Diffstat (limited to 'src/common-util')
| -rw-r--r-- | src/common-util/wiki-data.js | 113 |
1 files changed, 60 insertions, 53 deletions
diff --git a/src/common-util/wiki-data.js b/src/common-util/wiki-data.js
index 36392fcb..e92a80d5 100644
--- a/src/common-util/wiki-data.js
+++ b/src/common-util/wiki-data.js
@@ -66,77 +66,84 @@ export function compareKebabCase(name1, name2) {
 
 // Specific data utilities
 
-// Matches heading details from commentary data in roughly the format:
+// Ostensibly supports these strings. Maybe.
 //
-//    <i>artistText:</i> (annotation, date)
+// * "25 December 2019" - one or two number digits, followed by any text,
+//   followed by four number digits
+// * "December 25, 2019" - one all-letters word, a space, one or two number
+//   digits, a comma, and four number digits
+// * "12/25/2019" etc - three sets of one to four number digits, separated
+//   by slashes or dashes (only valid orders are MM/DD/YYYY and YYYY/MM/DD)
 //
-// where capturing group "annotation" can be any text at all, except that the
-// last entry (past a comma or the only content within parentheses), if parsed
-// as a date, is the capturing group "date". "Parsing as a date" means matching
-// one of these formats:
-//
-//   * "25 December 2019" - one or two number digits, followed by any text,
-//     followed by four number digits
-//   * "December 25, 2019" - one all-letters word, a space, one or two number
-//     digits, a comma, and four number digits
-//   * "12/25/2019" etc - three sets of one to four number digits, separated
-//     by slashes or dashes (only valid orders are MM/DD/YYYY and YYYY/MM/DD)
-//
-// Note that the annotation and date are always wrapped by one opening and one
-// closing parentheses. The whole heading does NOT need to match the entire
-// line it occupies (though it does always start at the first position on that
-// line), and if there is more than one closing parenthesis on the line, the
-// annotation will always cut off only at the last parenthesis, or a comma
-// preceding a date and then the last parenthesis. This is to ensure that
-// parentheses can be part of the actual annotation content.
-//
-// Capturing group "artistReference" is all the characters between <i> and </i>
-// (apart from the pipe and the "artistText" group, if present), and is either
-// the name of one or more artist or "artist:directory"-style references,
-// joined by commas, if multiple.
-//
-// This regular expression *doesn't* match bodies, which will need to be parsed
-// out of the original string based on the indices matched using this.
-//
-
 const dateRegex = groupName =>
-  String.raw`(?<${groupName}>[a-zA-Z]+ [0-9]{1,2}, [0-9]{4,4}|[0-9]{1,2} [^,]*[0-9]{4,4}|[0-9]{1,4}[-/][0-9]{1,4}[-/][0-9]{1,4})`;
+  String.raw`(?<${groupName}>` +
+    String.raw`[a-zA-Z]+ [0-9]{1,2}, [0-9]{4,4}|` +
+    String.raw`[0-9]{1,2} [^,]*[0-9]{4,4}|` +
+    String.raw`[0-9]{1,4}[-/][0-9]{1,4}[-/][0-9]{1,4}` +
+  String.raw`)`;
+
+const contentEntryHeadingRegexRaw =
+  String.raw`^<i>(?<artists>.+?):<\/i>(?: \((?<annotation>.*)\))?$`;
+
+const contentEntryHeadingRegex =
+  new RegExp(contentEntryHeadingRegexRaw, 'gm');
+
+const contentEntryAnnotationTailRegexRaw =
+  String.raw`(?:, |^)` +
+
+  String.raw`(?:(?<dateKind>sometime|throughout|around) )?` +
+  String.raw`${dateRegex('date')}` +
+  String.raw`(?: ?- ?${dateRegex('secondDate')})?` +
 
-const commentaryRegexRaw =
-  String.raw`^<i>(?<artistText>.+?):<\/i>(?: \((?<annotation>(?:.*?(?=,|\)[^)]*$))*?)(?:,? ?(?:(?<dateKind>sometime|throughout|around) )?${dateRegex('date')}(?: ?- ?${dateRegex('secondDate')})?(?: (?<accessKind>captured|accessed) ${dateRegex('accessDate')})?)?\))?`;
-const commentaryRegex =
-  new RegExp(commentaryRegexRaw, 'gm');
+  String.raw`(?: ?(?<= )` +
+    String.raw`(?<accessKind>captured|accessed) ${dateRegex('accessDate')}` +
+  String.raw`)?` +
 
-export function matchContentEntries(sourceText) {
-  const matchEntries = [];
+  String.raw`$`;
 
-  let previousMatchEntry = null;
-  let previousEndIndex = null;
+const contentEntryAnnotationTailRegex =
+  new RegExp(contentEntryAnnotationTailRegexRaw);
+
+export function* matchContentEntries(sourceText) {
+  let workingEntry = null;
+  let workingBodyIndex = null;
 
   const trimBody = body =>
     body
       .replace(/^\n*/, '')
       .replace(/\n*$/, '');
 
-  for (const {0: matchText, index: startIndex, groups: matchEntry}
-       of sourceText.matchAll(commentaryRegex)) {
-    if (previousMatchEntry) {
-      previousMatchEntry.body =
-        trimBody(sourceText.slice(previousEndIndex, startIndex));
+  for (const headingMatch of sourceText.matchAll(contentEntryHeadingRegex)) {
+    if (workingEntry) {
+      workingEntry.body =
+        trimBody(sourceText.slice(workingBodyIndex, headingMatch.index));
+
+      yield workingEntry;
     }
 
-    matchEntries.push(matchEntry);
+    workingEntry = {...headingMatch.groups};
 
-    previousMatchEntry = matchEntry;
-    previousEndIndex = startIndex + matchText.length;
-  }
+    if (workingEntry.annotation) {
+      const annotationTailMatch =
+        workingEntry.annotation.match(contentEntryAnnotationTailRegex);
 
-  if (previousMatchEntry) {
-    previousMatchEntry.body =
-      trimBody(sourceText.slice(previousEndIndex));
+      if (annotationTailMatch) {
+        workingEntry.annotation =
+          workingEntry.annotation.slice(0, annotationTailMatch.index);
+
+        Object.assign(workingEntry, annotationTailMatch.groups);
+      }
+    }
+
+    workingBodyIndex = headingMatch.index + headingMatch[0].length;
   }
 
-  return matchEntries;
+  if (workingEntry) {
+    workingEntry.body =
+      trimBody(sourceText.slice(workingBodyIndex));
+
+    yield workingEntry;
+  }
 }
 
 export function filterAlbumsByCommentary(albums) {