first commit

This commit is contained in:
2025-04-24 13:11:28 +08:00
commit ff9c54d5e4
5960 changed files with 834111 additions and 0 deletions

View File

@@ -0,0 +1,82 @@
# Lezer-LaTeX, a LaTeX Parser
Lezer-LaTeX is a LaTeX parser implemented with [lezer](https://lezer.codemirror.net/), the parser system used by [CodeMirror 6](https://codemirror.net/6/).
The parser is written in a "grammar" file (and a "tokens" file with custom tokenizer logic), which is then compiled by `@lezer/generator` into a parser module and a "terms" module. The parser module is then loaded by CodeMirror 6 in the web frontend codebase.
## Important files
- Source files:
- `./latex.grammar`: The grammar file, containing the specification for the parser
- `./tokens.mjs`: The custom tokenizer logic, required by some rules in the grammar
- Generated files:
- `./latex.mjs`: The generated parser
- `./latex.terms.mjs`: The generated terms file
- (these files are ignored by git, eslint, and prettier)
- Scripts:
- `web/scripts/lezer-latex/generate.js`: A script which runs the generator on the grammar, producing the generated parser/terms files
- `web/scripts/lezer-latex/run.mjs`: A script that runs the parser against a supplied file, and prints the tree to the terminal
- Webpack plugins:
- `web/webpack-plugins/lezer-grammar-compiler.js`: A webpack plugin that calls the generator as part of the webpack build. In dev, it will automatically re-build the parser when the grammar file changes.
## NPM tasks
- `lezer-latex:generate`: Generate the parser files from the grammar
- (Calls `lezer-latex/generate.js`)
- This should be run whenever the grammar changes
- `lezer-latex:run`: Run the parser against a file
- (Calls `lezer-latex/run.mjs`)
### Generating the parser
From the monorepo root:
``` sh
bin/npm -w services/web run 'lezer-latex:generate'
```
## Tests
Unit tests for the parser live in `web/test/unit/src/LezerLatex`. There are three kinds of test, in three subdirectories:
- `corpus/`: A set of tests using lezer's test framework, consisting of example text and the expected parse tree
- `examples/`: A set of realistic LaTeX documents. These tests pass if the files parse with no errors
- `regressions/`: Like `examples/`, these are expected to parse without error, but they are not realistic documents.
These tests run as part of `test_frontend`. You can run these tests alone by invoking:
``` sh
make test_unit MOCHA_GREP='lezer-latex'
```
## Trying the parser
While developing the parser, you can run it against a file by calling the `lezer-latex:run` task. There are
some example files in the test suite, at `web/test/unit/src/LezerLatex/examples/`.
For example:
``` sh
bin/npm -w services/web run 'lezer-latex:run' web/test/unit/src/LezerLatex/examples/amsmath.tex
```
If you omit the file path, the default file (`examples/demo.tex`) will be run.
## Integration into web
The web frontend imports the parser (from `latex.mjs`), in `frontend/js/features/source-editor/languages/latex/index.ts`.
The parser is then plugged in to the CM6 language system.
### The web build
In `web/Dockerfile`, we have a `RUN` command that calls `lezer-latex:generate` as part of the build. This is necessary to ensure the parser is built before the CI tests run (notably: we can't do the build during the tests, because we can't write to disk during that stage of CI).

View File

@@ -0,0 +1,829 @@
// Track environments
@context elementContext from "./tokens.mjs"
// External tokens must be defined before normal @tokens to take precedence
// over them.
@external tokens verbTokenizer from "./tokens.mjs" {
VerbContent
}
@external tokens lstinlineTokenizer from "./tokens.mjs" {
LstInlineContent
}
@external tokens literalArgTokenizer from "./tokens.mjs" {
LiteralArgContent
}
@external tokens spaceDelimitedLiteralArgTokenizer from "./tokens.mjs" {
SpaceDelimitedLiteralArgContent
}
@external tokens verbatimTokenizer from "./tokens.mjs" {
VerbatimContent
}
// external tokenizer to read control sequence names including @ signs
// (which are often used in TeX definitions).
@external tokens csnameTokenizer from "./tokens.mjs" {
Csname
}
@external tokens trailingContentTokenizer from "./tokens.mjs" {
TrailingWhitespaceOnly,
TrailingContent
}
// It doesn't seem to be possible to access specialized tokens in the context tracker.
// They have id's which are not exported in the latex.terms.js file.
// This is a workaround: use an external specializer to explicitly choose the terms
// to use for the specialized tokens.
@external specialize {CtrlSeq} specializeCtrlSeq from "./tokens.mjs" {
Begin,
End,
RefCtrlSeq,
RefStarrableCtrlSeq,
CiteCtrlSeq,
CiteStarrableCtrlSeq,
LabelCtrlSeq,
MathTextCtrlSeq,
HboxCtrlSeq,
TitleCtrlSeq,
DocumentClassCtrlSeq,
UsePackageCtrlSeq,
HrefCtrlSeq,
UrlCtrlSeq,
VerbCtrlSeq,
LstInlineCtrlSeq,
IncludeGraphicsCtrlSeq,
CaptionCtrlSeq,
DefCtrlSeq,
LetCtrlSeq,
LeftCtrlSeq,
RightCtrlSeq,
NewCommandCtrlSeq,
RenewCommandCtrlSeq,
NewEnvironmentCtrlSeq,
RenewEnvironmentCtrlSeq,
// services/web/frontend/js/features/outline/outline-parser.js
BookCtrlSeq,
PartCtrlSeq,
ChapterCtrlSeq,
SectionCtrlSeq,
SubSectionCtrlSeq,
SubSubSectionCtrlSeq,
ParagraphCtrlSeq,
SubParagraphCtrlSeq,
InputCtrlSeq,
IncludeCtrlSeq,
ItemCtrlSeq,
NewTheoremCtrlSeq,
TheoremStyleCtrlSeq,
CenteringCtrlSeq,
BibliographyCtrlSeq,
BibliographyStyleCtrlSeq,
AuthorCtrlSeq,
AffilCtrlSeq,
AffiliationCtrlSeq,
DateCtrlSeq,
MaketitleCtrlSeq,
TextColorCtrlSeq,
ColorBoxCtrlSeq,
HLineCtrlSeq,
TopRuleCtrlSeq,
MidRuleCtrlSeq,
BottomRuleCtrlSeq,
MultiColumnCtrlSeq,
ParBoxCtrlSeq,
TextBoldCtrlSeq,
TextItalicCtrlSeq,
TextSmallCapsCtrlSeq,
TextTeletypeCtrlSeq,
TextMediumCtrlSeq,
TextSansSerifCtrlSeq,
TextSuperscriptCtrlSeq,
TextSubscriptCtrlSeq,
TextStrikeOutCtrlSeq,
EmphasisCtrlSeq,
UnderlineCtrlSeq,
SetLengthCtrlSeq
}
@external specialize {EnvName} specializeEnvName from "./tokens.mjs" {
DocumentEnvName,
TabularEnvName,
EquationEnvName,
EquationArrayEnvName,
VerbatimEnvName,
TikzPictureEnvName,
FigureEnvName,
ListEnvName,
TableEnvName
}
@external specialize {CtrlSym} specializeCtrlSym from "./tokens.mjs" {
OpenParenCtrlSym,
CloseParenCtrlSym,
OpenBracketCtrlSym,
CloseBracketCtrlSym,
LineBreakCtrlSym
}
@tokens {
  // Control sequences: a backslash followed by one or more letters (\foo)...
  CtrlSeq { "\\" $[a-zA-Z]+ }
  // ...or a backslash followed by a single non-letter (\%, \\, \$, ...)
  CtrlSym { "\\" ![a-zA-Z] }
  // tokens for paragraphs
  Whitespace { $[ \t]+ }
  NewLine { "\n" }
  // Two or more consecutive newlines form a paragraph break
  BlankLine { "\n" "\n"+ }
  // everything is normal text, except these characters
  // (note: only the FIRST character also excludes ' ', so a Normal run may
  // contain embedded spaces but cannot start with one — presumably so that
  // Whitespace wins at a word boundary; confirm intended)
  Normal { ![\\{}\[\]$&~#^_% \t\n] ![\\{}\[\]$&~#^_%\t\n]* }
  // More specific tokens take precedence over Normal text
  @precedence { CtrlSeq, CtrlSym, BlankLine, NewLine, Whitespace, Normal }
  OpenBrace[closedBy=CloseBrace] { "{" }
  CloseBrace[openedBy=OpenBrace] { "}" }
  OpenBracket[closedBy=CloseBracket] { "[" }
  CloseBracket[openedBy=OpenBracket] { "]" }
  // A comment runs to (and includes) the end of the line
  Comment { "%" ![\n]* "\n"? }
  Dollar { "$" }
  Number { $[0-9]+ ("." $[0-9]*)? }
  MathSpecialChar { $[^_=<>()\-+/*]+ } // FIXME not all of these are special
  MathChar { ![0-9^_=<>()\-+/*\\{}\[\]$%&~ \t\n]+ }
  @precedence { Number, MathSpecialChar, MathChar }
  Ampersand { "&" }
  Tilde { "~" }
  // Environment names: letters with an optional trailing star (e.g. align*).
  // NOTE(review): the external envNameAfter() in tokens.mjs also accepts
  // digits, '+' and '@' — this token is narrower; confirm intended.
  EnvName { $[a-zA-Z]+ $[*]? }
}
@top LaTeX {
Text
}
@skip { Comment }
// TEXT MODE
optionalWhitespace {
!argument Whitespace
}
OptionalArgument {
!argument OpenBracket ShortOptionalArg CloseBracket
}
TextArgument {
!argument OpenBrace LongArg CloseBrace
}
SectioningArgument {
!argument OpenBrace LongArg CloseBrace
}
LabelArgument {
!argument ShortTextArgument
}
RefArgument {
!argument ShortTextArgument
}
BibKeyArgument {
!argument ShortTextArgument
}
PackageArgument {
!argument ShortTextArgument
}
TabularArgument {
!argument OpenBrace TabularContent CloseBrace
}
UrlArgument {
OpenBrace LiteralArgContent CloseBrace
}
FilePathArgument {
OpenBrace LiteralArgContent CloseBrace
}
BareFilePathArgument {
Whitespace SpaceDelimitedLiteralArgContent
}
DefinitionArgument {
!argument NewLine? Whitespace* OpenBrace DefinitionFragment? CloseBrace
}
MacroParameter {
"#" ("1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9")
}
OptionalMacroParameter {
OpenBracket MacroParameter CloseBracket
}
// The autocompletion code in services/web/frontend/js/features/source-editor/utils/tree-operations/commands.ts
// depends on following the `KnownCommand { Command { CommandCtrlSeq [args] } }`
// structure
KnownCommand<ArgumentType> {
Title {
TitleCtrlSeq optionalWhitespace? OptionalArgument? TextArgument
} |
Author {
AuthorCtrlSeq optionalWhitespace? OptionalArgument? optionalWhitespace? TextArgument
} |
Affil {
AffilCtrlSeq optionalWhitespace? OptionalArgument? optionalWhitespace? TextArgument
} |
Affiliation {
AffiliationCtrlSeq optionalWhitespace? OptionalArgument? optionalWhitespace? TextArgument
} |
Date {
DateCtrlSeq optionalWhitespace? OptionalArgument? optionalWhitespace? ShortTextArgument
} |
DocumentClass {
DocumentClassCtrlSeq optionalWhitespace? OptionalArgument?
DocumentClassArgument { ShortTextArgument }
} |
BibliographyCommand {
BibliographyCtrlSeq optionalWhitespace?
BibliographyArgument { ShortTextArgument }
} |
BibliographyStyleCommand {
BibliographyStyleCtrlSeq optionalWhitespace?
BibliographyStyleArgument { ShortTextArgument }
} |
UsePackage {
UsePackageCtrlSeq optionalWhitespace? OptionalArgument?
PackageArgument
} |
TextColorCommand {
TextColorCtrlSeq optionalWhitespace? ShortTextArgument optionalWhitespace? ArgumentType
} |
ColorBoxCommand {
ColorBoxCtrlSeq optionalWhitespace? ShortTextArgument optionalWhitespace? ArgumentType
} |
HrefCommand {
HrefCtrlSeq optionalWhitespace? UrlArgument ShortTextArgument
} |
NewTheoremCommand {
NewTheoremCtrlSeq "*"? optionalWhitespace? ShortTextArgument ((OptionalArgument? TextArgument) | (TextArgument OptionalArgument))
} |
TheoremStyleCommand {
TheoremStyleCtrlSeq optionalWhitespace? ShortTextArgument
} |
UrlCommand {
UrlCtrlSeq optionalWhitespace? UrlArgument
} |
VerbCommand {
VerbCtrlSeq VerbContent
} |
LstInlineCommand {
LstInlineCtrlSeq optionalWhitespace? OptionalArgument? LstInlineContent
} |
IncludeGraphics {
IncludeGraphicsCtrlSeq optionalWhitespace? OptionalArgument?
IncludeGraphicsArgument { FilePathArgument }
} |
Caption {
CaptionCtrlSeq "*"? optionalWhitespace? OptionalArgument? TextArgument
} |
Label {
LabelCtrlSeq optionalWhitespace? LabelArgument
} |
Ref {
(RefCtrlSeq | RefStarrableCtrlSeq "*"?) optionalWhitespace? OptionalArgument? optionalWhitespace? OptionalArgument? optionalWhitespace? RefArgument
} |
Cite {
(CiteCtrlSeq | CiteStarrableCtrlSeq "*"?) optionalWhitespace? OptionalArgument? optionalWhitespace? OptionalArgument? optionalWhitespace? BibKeyArgument
} |
Def {
// allow more general Csname argument to \def commands, since other symbols such as '@' are often used in definitions
DefCtrlSeq optionalWhitespace? (Csname | CtrlSym) optionalWhitespace? (MacroParameter | OptionalMacroParameter)* optionalWhitespace? DefinitionArgument
} |
Let {
LetCtrlSeq Csname optionalWhitespace? "="? optionalWhitespace? Csname
} |
Hbox {
HboxCtrlSeq optionalWhitespace? TextArgument
} |
NewCommand {
NewCommandCtrlSeq optionalWhitespace?
(Csname | OpenBrace LiteralArgContent CloseBrace)
(OptionalArgument)*
DefinitionArgument
} |
RenewCommand {
RenewCommandCtrlSeq optionalWhitespace?
(Csname | OpenBrace LiteralArgContent CloseBrace)
(OptionalArgument)*
DefinitionArgument
} |
NewEnvironment {
NewEnvironmentCtrlSeq optionalWhitespace?
(OpenBrace LiteralArgContent CloseBrace)
(OptionalArgument)*
DefinitionArgument
DefinitionArgument
} |
RenewEnvironment {
RenewEnvironmentCtrlSeq optionalWhitespace?
(Csname | OpenBrace LiteralArgContent CloseBrace)
(OptionalArgument)*
DefinitionArgument
DefinitionArgument
} |
Input {
InputCtrlSeq InputArgument { ( FilePathArgument | BareFilePathArgument ) }
} |
Include {
IncludeCtrlSeq IncludeArgument { FilePathArgument }
} |
Centering {
CenteringCtrlSeq
} |
Item {
ItemCtrlSeq OptionalArgument? optionalWhitespace?
} |
Maketitle {
MaketitleCtrlSeq optionalWhitespace?
} |
HorizontalLine {
(HLineCtrlSeq | TopRuleCtrlSeq | MidRuleCtrlSeq | BottomRuleCtrlSeq) optionalWhitespace?
} |
MultiColumn {
MultiColumnCtrlSeq
optionalWhitespace? SpanArgument { ShortTextArgument }
optionalWhitespace? ColumnArgument { ShortTextArgument }
optionalWhitespace? TabularArgument
} |
MathTextCommand {
MathTextCtrlSeq optionalWhitespace? "*"? TextArgument
} |
ParBoxCommand {
ParBoxCtrlSeq
(optionalWhitespace? OptionalArgument)*
ShortTextArgument
optionalWhitespace? TextArgument
} |
TextBoldCommand {
TextBoldCtrlSeq TextArgument
} |
TextItalicCommand {
TextItalicCtrlSeq TextArgument
} |
TextSmallCapsCommand {
TextSmallCapsCtrlSeq TextArgument
} |
TextTeletypeCommand {
TextTeletypeCtrlSeq TextArgument
} |
TextMediumCommand {
TextMediumCtrlSeq TextArgument
} |
TextSansSerifCommand {
TextSansSerifCtrlSeq TextArgument
} |
TextSuperscriptCommand {
TextSuperscriptCtrlSeq TextArgument
} |
TextSubscriptCommand {
TextSubscriptCtrlSeq TextArgument
} |
StrikeOutCommand {
TextStrikeOutCtrlSeq ArgumentType
} |
EmphasisCommand {
EmphasisCtrlSeq ArgumentType
} |
UnderlineCommand {
UnderlineCtrlSeq ArgumentType
} |
SetLengthCommand {
SetLengthCtrlSeq optionalWhitespace? ShortTextArgument optionalWhitespace? ShortTextArgument
}
}
UnknownCommand {
(CtrlSeq !argument Whitespace (OptionalArgument | TextArgument)+)
| (CtrlSeq (OptionalArgument | TextArgument)+)
| CtrlSeq Whitespace?
| CtrlSym
}
Command {
KnownCommand<TextArgument>
| UnknownCommand
| KnownCtrlSym
// Not technically allowed in normal mode, but not worth failing the parse over
| LeftCtrlSeq
| RightCtrlSeq
}
KnownCtrlSym {
LineBreak {
LineBreakCtrlSym OptionalArgument?
}
}
textBase {
( Command
| DollarMath
| BracketMath
| ParenMath
| NewLine
| Normal
| Whitespace
| Ampersand
| Tilde
)
}
textWithBrackets {
( textBase
| OpenBracket
| CloseBracket
)
}
textWithEnvironmentsAndBlankLines {
( BlankLine
| KnownEnvironment
| Environment
| textWithBrackets
)
}
textWithGroupsEnvironmentsAndBlankLines {
textWithEnvironmentsAndBlankLines
| Group<Text>
}
Content<Element> {
Element
}
SectioningCommand<Command> {
Command optionalWhitespace? "*"? optionalWhitespace? OptionalArgument? optionalWhitespace? SectioningArgument
}
documentSection<Command, Next> {
SectioningCommand<Command> Content<(sectionText | !section Next)*>
}
Book[@isGroup="$Section"] { documentSection<BookCtrlSeq, Part | Chapter | Section | SubSection | SubSubSection | Paragraph | SubParagraph> }
Part[@isGroup="$Section"] { documentSection<PartCtrlSeq, Chapter | Section | SubSection | SubSubSection | Paragraph | SubParagraph> }
Chapter[@isGroup="$Section"] { documentSection<ChapterCtrlSeq, Section | SubSection | SubSubSection | Paragraph | SubParagraph> }
Section[@isGroup="$Section"] { documentSection<SectionCtrlSeq, SubSection | SubSubSection | Paragraph | SubParagraph> }
SubSection[@isGroup="$Section"] { documentSection<SubSectionCtrlSeq, SubSubSection | Paragraph | SubParagraph> }
SubSubSection[@isGroup="$Section"] { documentSection<SubSubSectionCtrlSeq, Paragraph | SubParagraph> }
Paragraph[@isGroup="$Section"] { documentSection<ParagraphCtrlSeq, SubParagraph> }
SubParagraph[@isGroup="$Section"] { SectioningCommand<SubParagraphCtrlSeq> Content<sectionText*> }
sectioningCommand {
Book | Part | Chapter | Section | SubSection | SubSubSection | Paragraph | SubParagraph
}
sectionText {
!section (
textWithGroupsEnvironmentsAndBlankLines
)+
}
Text {
( sectionText
| sectioningCommand)+
}
LongArg {
( textWithBrackets
| NonEmptyGroup<LongArg>
| KnownEnvironment
| Environment
| BlankLine
| "#" // macro character
| "_" | "^" // other math chars
)*
}
ShortTextArgument {
OpenBrace ShortArg CloseBrace
}
ShortArg {
( textWithBrackets
| NonEmptyGroup<ShortArg>
| "#" // macro character
| "_" | "^" // other math chars
)*
}
ShortOptionalArg {
( textBase
| NonEmptyGroup<ShortOptionalArg>
| "#" // macro character
| "_" // underscore is used in some parameter names
)*
}
TikzPictureContent { /// same as Text but with added allowed characters
( textWithEnvironmentsAndBlankLines
| NonEmptyGroup<TikzPictureContent>
| "#" // macro character
| "_" | "^" // other math chars
)+
}
DefinitionFragment {
( DefinitionFragmentCommand
| Begin
| End
| Group<DefinitionFragment>
| Dollar
| OpenParenCtrlSym
| CloseParenCtrlSym
| OpenBracketCtrlSym
| CloseBracketCtrlSym
| LeftCtrlSeq
| RightCtrlSeq
| BlankLine
| NewLine
| Normal
| Whitespace
| OpenBracket
| CloseBracket
| "#" // macro character
| Ampersand // for tables
| Tilde // unbreakable space
| "_" | "^" // other math chars
| SectioningCommand<
BookCtrlSeq |
PartCtrlSeq |
ChapterCtrlSeq |
SectionCtrlSeq |
SubSectionCtrlSeq |
SubSubSectionCtrlSeq |
ParagraphCtrlSeq |
SubParagraphCtrlSeq
>
)+
}
DefinitionFragmentArgument {
OpenBrace DefinitionFragment? CloseBrace
}
DefinitionFragmentCommand {
KnownCommand<TextArgument>
| DefinitionFragmentUnknownCommand { genericUnknownCommandWithOptionalArguments<DefinitionFragmentArgument, OptionalArgument> }
| KnownCtrlSym
}
KnownEnvironment {
( DocumentEnvironment
| TabularEnvironment
| EquationEnvironment
| EquationArrayEnvironment
| VerbatimEnvironment
| TikzPictureEnvironment
| FigureEnvironment
| ListEnvironment
| TableEnvironment
)
}
BeginEnv<name> {
Begin
EnvNameGroup<name>
OptionalArgument?
(!argument TextArgument)*
}
EndEnv<name> {
End
EnvNameGroup<name>
}
DocumentEnvironment[@isGroup="$Environment"] {
BeginEnv<DocumentEnvName>
Content<Text>
EndEnv<DocumentEnvName>
(TrailingWhitespaceOnly | TrailingContent)?
}
TabularContent {
(textWithGroupsEnvironmentsAndBlankLines)*
}
TabularEnvironment[@isGroup="$Environment"] {
BeginEnv<TabularEnvName>
Content<TabularContent>
EndEnv<TabularEnvName>
}
TableEnvironment[@isGroup="$Environment"] {
BeginEnv<TableEnvName>
Content<Text>
EndEnv<TableEnvName>
}
EquationEnvironment[@isGroup="$Environment"] {
BeginEnv<EquationEnvName>
Content<Math?>
EndEnv<EquationEnvName>
}
EquationArrayEnvironment[@isGroup="$Environment"] {
BeginEnv<EquationArrayEnvName>
Content<Math?>
EndEnv<EquationArrayEnvName>
}
VerbatimEnvironment[@isGroup="$Environment"] {
BeginEnv<VerbatimEnvName>
Content<VerbatimContent>
EndEnv<VerbatimEnvName>
}
TikzPictureEnvironment[@isGroup="$Environment"] {
BeginEnv<TikzPictureEnvName>
Content<TikzPictureContent>
EndEnv<TikzPictureEnvName>
}
FigureEnvironment[@isGroup="$Environment"] {
BeginEnv<FigureEnvName>
Content<Text>
EndEnv<FigureEnvName>
}
ListEnvironment[@isGroup="$Environment"] {
BeginEnv<ListEnvName>
Content<Text>
EndEnv<ListEnvName>
}
EnvNameGroup<name> {
OpenBrace name CloseBrace
}
Environment[@isGroup="$Environment"] {
BeginEnv<EnvName?>
Content<Text>
EndEnv<EnvName?>
}
Group<GroupContent> {
OpenBrace GroupContent? CloseBrace
}
NonEmptyGroup<GroupContent> {
OpenBrace GroupContent CloseBrace
}
/// MATH MODE
DollarMath[@isGroup="$MathContainer"] {
Dollar (InlineMath | DisplayMath) Dollar
}
InlineMath {
Math
}
DisplayMath {
Dollar Math? Dollar
}
OpenParenMath[closedBy=CloseParenMath] {
OpenParenCtrlSym
}
CloseParenMath[openedBy=OpenParenMath] {
CloseParenCtrlSym
}
// alternative syntax \( math \) for inline math, it is the same as $ math $
ParenMath[@isGroup="$MathContainer"] {
OpenParenMath
Math?
CloseParenMath
}
OpenBracketMath[closedBy=CloseBracketMath] {
OpenBracketCtrlSym
}
CloseBracketMath[openedBy=OpenBracketMath] {
CloseBracketCtrlSym
}
// alternative syntax \[ math \] for display math, it is the same as $$ math $$
BracketMath[@isGroup="$MathContainer"] {
OpenBracketMath
Math?
CloseBracketMath
}
// FIXME: we should have separate math modes for inline and display math,
// because display math can contain blank lines while inline math cannot.
Math {
( MathCommand
| Group<Math>
| MathDelimitedGroup
| MathSpecialChar
| Number
| NewLine
| Whitespace
| KnownEnvironment
| Environment
| MathChar
| OpenBracket
| CloseBracket
| Ampersand
| Tilde
)+
}
MathCommand {
KnownCommand<MathArgument>
| MathUnknownCommand { genericUnknownCommand<MathArgument> }
| KnownCtrlSym
}
@external tokens argumentListTokenizer from "./tokens.mjs" {
hasMoreArguments,
endOfArguments
}
@external tokens argumentListWithOptionalTokenizer from "./tokens.mjs" {
hasMoreArgumentsOrOptionals,
endOfArgumentsAndOptionals
}
genericUnknownCommand<ArgumentType> {
CtrlSeq (hasMoreArguments optionalWhitespace? ArgumentType)* endOfArguments
| CtrlSym
}
genericUnknownCommandWithOptionalArguments<ArgumentType, OptionalArgumentType> {
CtrlSeq (hasMoreArgumentsOrOptionals optionalWhitespace? (ArgumentType | OptionalArgumentType))* endOfArgumentsAndOptionals
| CtrlSym
}
MathArgument {
OpenBrace Math? CloseBrace
}
MathDelimitedGroup {
MathOpening Math? MathClosing
}
// FIXME: we have the same problem with specialize on \left,\right as the delimiters
MathOpening {
LeftCtrlSeq optionalWhitespace? MathDelimiter
}
MathClosing {
RightCtrlSeq optionalWhitespace? MathDelimiter
}
MathDelimiter {
// Allowed delimiters, from the LaTeX manual, table 3.10
"/" | "|" | "(" | ")" | "[" | "]" |
"\\{" | "\\}" | "\\|" |
"\\lfloor" | "\\rfloor" |
"\\lceil" | "\\rceil" |
"\\langle" | "\\rangle" |
"\\backslash" | "\\uparrow" |
"\\Uparrow" | "\\Downarrow" |
"\\updownarrow" | "\\Updownarrow" |
"\\downarrow" | "\\lvert" |
"\\lVert" | "\\rVert" |
"\\rvert" | "\\vert" | "\\Vert" |
"\\lbrace" | "\\rbrace" |
"\\lbrack" | "\\rbrack" |
// Also allow the empty match
"."
}
// NOTE: precedence works differently for rules and tokens: in a rule body
// you reference a specifier (!section, !argument) which must be declared
// in this @precedence block.
@precedence {
  section @left, // sectioning commands nest left-associatively
  argument @left // make CtrlSeq arguments left associative
}

View File

@@ -0,0 +1,747 @@
/* Hand-written tokenizer for LaTeX. */
import { ExternalTokenizer, ContextTracker } from '@lezer/lr'
import {
LiteralArgContent,
SpaceDelimitedLiteralArgContent,
VerbContent,
VerbatimContent,
LstInlineContent,
Begin,
End,
KnownEnvironment,
Csname,
TrailingWhitespaceOnly,
TrailingContent,
RefCtrlSeq,
RefStarrableCtrlSeq,
CiteCtrlSeq,
CiteStarrableCtrlSeq,
LabelCtrlSeq,
MathTextCtrlSeq,
HboxCtrlSeq,
TitleCtrlSeq,
AuthorCtrlSeq,
AffilCtrlSeq,
AffiliationCtrlSeq,
DateCtrlSeq,
DocumentClassCtrlSeq,
UsePackageCtrlSeq,
HrefCtrlSeq,
UrlCtrlSeq,
VerbCtrlSeq,
LstInlineCtrlSeq,
IncludeGraphicsCtrlSeq,
CaptionCtrlSeq,
DefCtrlSeq,
LetCtrlSeq,
LeftCtrlSeq,
RightCtrlSeq,
NewCommandCtrlSeq,
RenewCommandCtrlSeq,
NewEnvironmentCtrlSeq,
RenewEnvironmentCtrlSeq,
DocumentEnvName,
TabularEnvName,
EquationEnvName,
EquationArrayEnvName,
VerbatimEnvName,
TikzPictureEnvName,
FigureEnvName,
OpenParenCtrlSym,
CloseParenCtrlSym,
OpenBracketCtrlSym,
CloseBracketCtrlSym,
LineBreakCtrlSym,
// Sectioning commands
BookCtrlSeq,
PartCtrlSeq,
ChapterCtrlSeq,
SectionCtrlSeq,
SubSectionCtrlSeq,
SubSubSectionCtrlSeq,
ParagraphCtrlSeq,
SubParagraphCtrlSeq,
InputCtrlSeq,
IncludeCtrlSeq,
ItemCtrlSeq,
NewTheoremCtrlSeq,
TheoremStyleCtrlSeq,
BibliographyCtrlSeq,
BibliographyStyleCtrlSeq,
CenteringCtrlSeq,
ListEnvName,
MaketitleCtrlSeq,
TextColorCtrlSeq,
ColorBoxCtrlSeq,
HLineCtrlSeq,
TopRuleCtrlSeq,
MidRuleCtrlSeq,
BottomRuleCtrlSeq,
TableEnvName,
MultiColumnCtrlSeq,
ParBoxCtrlSeq,
// Marker for end of argument lists
endOfArguments,
hasMoreArguments,
hasMoreArgumentsOrOptionals,
endOfArgumentsAndOptionals,
TextBoldCtrlSeq,
TextItalicCtrlSeq,
TextSmallCapsCtrlSeq,
TextTeletypeCtrlSeq,
TextMediumCtrlSeq,
TextSansSerifCtrlSeq,
TextSuperscriptCtrlSeq,
TextSubscriptCtrlSeq,
TextStrikeOutCtrlSeq,
EmphasisCtrlSeq,
UnderlineCtrlSeq,
SetLengthCtrlSeq,
} from './latex.terms.mjs'
// Upper bound on how far the argument-list tokenizers scan past blanks
// when deciding whether another argument follows.
const MAX_ARGUMENT_LOOKAHEAD = 100

// True for characters accepted in environment names:
// A-Z, a-z, 0-9, '*', '+' and '@'.
function nameChar(ch) {
  const isUpper = ch >= 65 && ch <= 90
  const isLower = ch >= 97 && ch <= 122
  const isDigit = ch >= 48 && ch <= 57
  return isUpper || isLower || isDigit || ch === 42 || ch === 43 || ch === 64
}

// True for ASCII letters only, i.e. [a-zA-Z].
function alphaChar(ch) {
  const isUpper = ch >= 65 && ch <= 90
  const isLower = ch >= 97 && ch <= 122
  return isUpper || isLower
}

// One-entry cache: the parser probes the same position repeatedly while
// exploring alternatives, so remember the last (input, position) lookup.
let cachedName = null
let cachedInput = null
let cachedPos = 0

// Reads the brace-wrapped environment name found `offset` characters ahead
// of the current input position. Returns the name string, null when the
// braces are empty, or undefined when there is no '{' at that offset.
function envNameAfter(input, offset) {
  const pos = input.pos + offset
  if (cachedInput === input && cachedPos === pos) {
    return cachedName
  }
  if (input.peek(offset) !== '{'.charCodeAt(0)) return
  let name = ''
  let i = offset + 1
  for (;;) {
    const code = input.peek(i)
    if (!nameChar(code)) break
    name += String.fromCharCode(code)
    i += 1
  }
  cachedInput = input
  cachedPos = pos
  cachedName = name === '' ? null : name
  return cachedName
}
// Linked-list node recording the stack of open environment names. `hash`
// folds the name into the parent's hash so lezer can compare contexts
// cheaply by value.
function ElementContext(name, parent) {
  this.name = name
  this.parent = parent
  let h = parent ? parent.hash : 0
  for (let i = 0; i < name.length; i++) {
    const code = name.charCodeAt(i)
    h += (h << 4) + code + (code << 8)
  }
  this.hash = h
}
// Context tracker mirroring the stack of currently-open environments, so
// external tokenizers (notably verbatimTokenizer) can read the innermost
// environment's name via stack.context.name.
export const elementContext = new ContextTracker({
  start: null,
  shift(context, term, stack, input) {
    // On a Begin token, push the upcoming environment name (read just past
    // the '\begin' text) onto the context chain.
    return term === Begin
      ? new ElementContext(envNameAfter(input, '\\begin'.length) || '', context)
      : context
  },
  reduce(context, term) {
    // Pop when a KnownEnvironment node completes.
    // NOTE(review): generic `Environment` nodes are not popped here — confirm
    // whether that imbalance is intentional (it only matters for hashing and
    // verbatim-name lookup).
    return term === KnownEnvironment && context ? context.parent : context
  },
  reuse(context, node, _stack, input) {
    // When a cached Begin node is reused, the input is presumably already
    // positioned at the '{', hence offset 0 — TODO confirm.
    const type = node.type.id
    return type === Begin
      ? new ElementContext(envNameAfter(input, 0) || '', context)
      : context
  },
  hash(context) {
    return context ? context.hash : 0
  },
  strict: false,
})
// tokenizer for \verb|...| commands: the character after \verb (or \verb*)
// is the delimiter, and everything up to the next occurrence of that
// delimiter on the same line is verbatim content.
export const verbTokenizer = new ExternalTokenizer(
  (input, stack) => {
    // \verb* is the space-marking variant; skip the star.
    if (input.next === '*'.charCodeAt(0)) input.advance()
    const delimiter = input.next
    if (delimiter === -1) return // hit end of file
    if (/\s|\*/.test(String.fromCharCode(delimiter))) return // invalid delimiter
    input.advance()
    for (;;) {
      const next = input.next
      // \verb content may not span lines; emit no token on EOF/newline.
      if (next === -1 || next === CHAR_NEWLINE) return
      input.advance()
      // note: the closing delimiter is consumed as part of the token
      if (next === delimiter) break
    }
    return input.acceptToken(VerbContent)
  },
  { contextual: false }
)
// tokenizer for \lstinline|...| commands: like \verb the first character is
// the delimiter, except that an opening '{' is closed by '}'.
export const lstinlineTokenizer = new ExternalTokenizer(
  (input, stack) => {
    let delimiter = input.next
    if (delimiter === -1) return // hit end of file
    if (/\s/.test(String.fromCharCode(delimiter))) {
      return // invalid delimiter
    }
    // Brace-delimited form: \lstinline{...} terminates at '}'.
    if (delimiter === CHAR_OPEN_BRACE) {
      delimiter = CHAR_CLOSE_BRACE
    }
    input.advance()
    for (;;) {
      const next = input.next
      // Inline listings may not span lines; emit no token on EOF/newline.
      if (next === -1 || next === CHAR_NEWLINE) return
      input.advance()
      // note: the closing delimiter is consumed as part of the token
      if (next === delimiter) break
    }
    return input.acceptToken(LstInlineContent)
  },
  { contextual: false }
)
// True when the input characters starting at `offset` spell out `expected`.
// Does not advance the input; only peeks.
const matchForward = (input, expected, offset = 0) => {
  let i = 0
  while (i < expected.length) {
    if (String.fromCharCode(input.peek(offset + i)) !== expected[i]) {
      return false
    }
    i += 1
  }
  return true
}
// tokenizer for \begin{verbatim}...\end{verbatim} environments: consumes
// everything up to the matching \end{<name>}, where <name> is the innermost
// environment name recorded by the context tracker.
export const verbatimTokenizer = new ExternalTokenizer(
  (input, stack) => {
    const delimiter = '\\end{' + stack.context.name + '}'
    for (let offset = 0; ; offset++) {
      const next = input.peek(offset)
      // Stop at EOF or just before the closing \end{...}; the \end itself is
      // left for the grammar to parse as the EndEnv rule.
      if (next === -1 || matchForward(input, delimiter, offset)) {
        return input.acceptToken(VerbatimContent, offset)
      }
    }
  },
  { contextual: false }
)
// tokenizer for \href{...} and similar commands: consume everything up to
// (but not including) the closing brace as literal text, so characters like
// % and \ inside URLs are not given their usual meaning.
export const literalArgTokenizer = new ExternalTokenizer(
  input => {
    for (let offset = 0; ; offset++) {
      const next = input.peek(offset)
      if (next === -1 || next === CHAR_CLOSE_BRACE) {
        return input.acceptToken(LiteralArgContent, offset)
      }
    }
  },
  { contextual: false }
)
// tokenizer for literal content delimited by whitespace, such as in
// `\input foo.tex`: consumes up to the next space, newline, or EOF.
// NOTE(review): a tab does not terminate the token here — confirm intended.
export const spaceDelimitedLiteralArgTokenizer = new ExternalTokenizer(
  input => {
    for (let offset = 0; ; offset++) {
      const next = input.peek(offset)
      if (next === -1 || next === CHAR_SPACE || next === CHAR_NEWLINE) {
        return input.acceptToken(SpaceDelimitedLiteralArgContent, offset)
      }
    }
  },
  { contextual: false }
)
// Look up the UTF-16 code unit of a one-character string.
const _char = (s) => s.charCodeAt(0)

// Character codes used throughout the tokenizers below.
const CHAR_BACKSLASH = _char('\\')
const CHAR_OPEN_BRACE = _char('{')
const CHAR_OPEN_BRACKET = _char('[')
const CHAR_CLOSE_BRACE = _char('}')
const CHAR_TAB = _char('\t')
const CHAR_SPACE = _char(' ')
const CHAR_NEWLINE = _char('\n')
// Builds a fallback tokenizer that scans ahead over spaces/tabs (bounded by
// MAX_ARGUMENT_LOOKAHEAD) and emits whichever marker token `getToken`
// chooses for the first non-blank character. Nothing is advanced, so the
// accepted token consumes no input — it acts as a zero-width marker.
const lookaheadTokenizer = getToken =>
  new ExternalTokenizer(
    input => {
      for (let i = 0; i < MAX_ARGUMENT_LOOKAHEAD; ++i) {
        const next = input.peek(i)
        if (next === CHAR_SPACE || next === CHAR_TAB) {
          continue
        }
        const token = getToken(next)
        if (token) {
          input.acceptToken(token)
          return
        }
        // NOTE(review): both current getToken callbacks always return a
        // token, so in practice the loop ends on the first non-blank char.
      }
    },
    { contextual: false, fallback: true }
  )
// Marker tokens telling the unknown-command rules whether another {...}
// argument follows the current position.
export const argumentListTokenizer = lookaheadTokenizer(next =>
  next === CHAR_OPEN_BRACE ? hasMoreArguments : endOfArguments
)

// As above, but [...] optional arguments also count as "more arguments".
export const argumentListWithOptionalTokenizer = lookaheadTokenizer(next =>
  next === CHAR_OPEN_BRACE || next === CHAR_OPEN_BRACKET
    ? hasMoreArgumentsOrOptionals
    : endOfArgumentsAndOptionals
)
const CHAR_AT_SYMBOL = _char('@')
// Tokenizer for control sequence names that may contain '@' (common in TeX
// package/class definitions), i.e. \\[a-zA-Z@]+, emitted as a Csname token.
export const csnameTokenizer = new ExternalTokenizer((input, stack) => {
  let offset = 0
  let end = -1
  // look at the first character, we are looking for acceptable control sequence names
  // including @ signs, \\[a-zA-Z@]+
  const next = input.peek(offset)
  if (next === -1) {
    return
  }
  // reject anything not starting with a backslash,
  // we only accept control sequences
  if (next !== CHAR_BACKSLASH) {
    return
  }
  offset++
  for (;;) {
    const next = input.peek(offset)
    // stop when we reach the end of file or a non-csname character
    if (next === -1 || !(alphaChar(next) || next === CHAR_AT_SYMBOL)) {
      end = offset - 1
      break
    }
    end = offset
    offset++
  }
  // NOTE(review): the loop above always sets `end` to at least 0 (the
  // backslash itself when no name characters follow), so this guard is
  // unreachable and a lone backslash is accepted as a 1-character Csname —
  // confirm that is intended.
  if (end === -1) return
  // accept the content as a valid control sequence
  return input.acceptToken(Csname, end + 1)
})
// '\end{document}' reversed, for matching backwards from the current pos.
const END_DOCUMENT_MARK = '\\end{document}'.split('').reverse()
// Tokenizer for content after \end{document}: emits TrailingWhitespaceOnly
// when only blanks/newlines remain, otherwise consumes everything to EOF as
// a single TrailingContent token.
export const trailingContentTokenizer = new ExternalTokenizer(
  (input, stack) => {
    if (input.next === -1) return // no trailing content
    // Look back for end-document mark, bail out if any characters do not match
    for (let i = 1; i < END_DOCUMENT_MARK.length + 1; i++) {
      if (String.fromCharCode(input.peek(-i)) !== END_DOCUMENT_MARK[i - 1]) {
        return
      }
    }
    // Skip trailing blanks; if we hit EOF it was whitespace only.
    while (input.next === CHAR_SPACE || input.next === CHAR_NEWLINE) {
      const next = input.advance()
      if (next === -1) return input.acceptToken(TrailingWhitespaceOnly) // trailing whitespace only
    }
    // accept all content up to the end of the document
    while (input.advance() !== -1) {
      //
    }
    return input.acceptToken(TrailingContent)
  }
)
// \ref-like commands (varioref, cleveref, zref, fancyref, nameref, ...)
// whose argument is a reference label; specialized to RefCtrlSeq.
const refCommands = new Set([
  '\\fullref',
  '\\Vref',
  '\\autopageref',
  '\\autoref',
  '\\eqref',
  '\\labelcpageref',
  '\\labelcref',
  '\\lcnamecref',
  '\\lcnamecrefs',
  '\\namecref',
  '\\nameCref',
  '\\namecrefs',
  '\\nameCrefs',
  '\\thnameref',
  '\\thref',
  '\\titleref',
  '\\vrefrange',
  '\\Crefrange',
  // NOTE(review): '\\Crefrang' looks like a typo for '\\Crefrange' (listed
  // just above) — confirm against the cleveref package.
  '\\Crefrang',
  '\\fref',
  '\\pref',
  '\\tref',
  '\\Aref',
  '\\Bref',
  '\\Pref',
  '\\Sref',
  // NOTE(review): '\\vref' also appears in refStarrableCommands below; one
  // of the two entries is redundant depending on lookup order — confirm.
  '\\vref',
  '\\nameref',
])
// \ref-like commands that additionally have a starred variant (\cref* etc.);
// specialized to RefStarrableCtrlSeq.
const refStarrableCommands = new Set([
  '\\vpageref',
  '\\vref',
  '\\zcpageref',
  '\\zcref',
  '\\zfullref',
  '\\zref',
  '\\zvpageref',
  '\\zvref',
  '\\cref',
  '\\Cref',
  '\\pageref',
  '\\ref',
  '\\Ref',
  '\\subref',
  '\\zpageref',
  '\\ztitleref',
  '\\vpagerefrange',
  '\\zvpagerefrange',
  '\\zvrefrange',
  '\\crefrange',
])
const citeCommands = new Set([
'\\autocites',
'\\Autocites',
'\\Cite',
'\\citeA',
'\\citealp',
'\\Citealp',
'\\citealt',
'\\Citealt',
'\\citeauthorNP',
'\\citeauthorp',
'\\Citeauthorp',
'\\citeauthort',
'\\Citeauthort',
'\\citeNP',
'\\citenum',
'\\citen',
'\\citeonline',
'\\cites',
'\\Cites',
'\\citeurl',
'\\citeyearpar',
'\\defcitealias',
'\\fnotecite',
'\\footcite',
'\\footcitetext',
'\\footfullcite',
'\\footnotecites',
'\\Footnotecites',
'\\fullcite',
'\\fullciteA',
'\\fullciteauthor',
'\\fullciteauthorNP',
'\\maskcite',
'\\maskciteA',
'\\maskcitealp',
'\\maskCitealp',
'\\maskcitealt',
'\\maskCitealt',
'\\maskciteauthor',
'\\maskciteauthorNP',
'\\maskciteauthorp',
'\\maskCiteauthorp',
'\\maskciteauthort',
'\\maskCiteauthort',
'\\maskciteNP',
'\\maskcitenum',
'\\maskcitep',
'\\maskCitep',
'\\maskcitepalias',
'\\maskcitet',
'\\maskCitet',
'\\maskcitetalias',
'\\maskciteyear',
'\\maskciteyearNP',
'\\maskciteyearpar',
'\\maskfullcite',
'\\maskfullciteA',
'\\maskfullciteauthor',
'\\maskfullciteauthorNP',
'\\masknocite',
'\\maskshortcite',
'\\maskshortciteA',
'\\maskshortciteauthor',
'\\maskshortciteauthorNP',
'\\maskshortciteNP',
'\\mautocite',
'\\Mautocite',
'\\mcite',
'\\Mcite',
'\\mfootcite',
'\\mfootcitetext',
'\\mparencite',
'\\Mparencite',
'\\msupercite',
'\\mtextcite',
'\\Mtextcite',
'\\nocite',
'\\nocitemeta',
'\\notecite',
'\\Parencite',
'\\parencites',
'\\Parencites',
'\\pnotecite',
'\\shortcite',
'\\shortciteA',
'\\shortciteauthor',
'\\shortciteauthorNP',
'\\shortciteNP',
'\\smartcite',
'\\Smartcite',
'\\smartcites',
'\\Smartcites',
'\\supercite',
'\\supercites',
'\\textcite',
'\\Textcite',
'\\textcites',
'\\Textcites',
])
// Citation commands that also accept a starred variant, e.g. \cite*{...}.
// Insertion order is preserved from the original hand-curated list.
const citeStarredCommands = new Set([
  '\\cite', '\\citeauthor', '\\Citeauthor', '\\citedate',
  '\\citep', '\\citepalias', '\\Citep', '\\citetitle',
  '\\citeyear', '\\parencite', '\\citet', '\\citetalias',
  '\\autocite', '\\Autocite',
])
const labelCommands = new Set(['\\label', '\\thlabel', '\\zlabel'])
const mathTextCommands = new Set(['\\text', '\\tag', '\\textrm', '\\intertext'])
// Lookup table mapping other known control sequences to their specialized
// token ids (term constants generated by @lezer/generator from the grammar).
// Consulted by specializeCtrlSeq after the Set-based categories above.
//
// NOTE(review): '\\addseq' looks like a typo for KOMA-Script's \addsec (its
// siblings \addpart and \addchap are mapped correctly above). '\\addsec' is
// added here; '\\addseq' is kept so existing behavior is unchanged.
const otherKnowncommands = {
  '\\hbox': HboxCtrlSeq,
  '\\title': TitleCtrlSeq,
  '\\author': AuthorCtrlSeq,
  '\\affil': AffilCtrlSeq,
  '\\affiliation': AffiliationCtrlSeq,
  '\\date': DateCtrlSeq,
  '\\documentclass': DocumentClassCtrlSeq,
  '\\usepackage': UsePackageCtrlSeq,
  '\\href': HrefCtrlSeq,
  '\\url': UrlCtrlSeq,
  '\\verb': VerbCtrlSeq,
  '\\lstinline': LstInlineCtrlSeq,
  '\\includegraphics': IncludeGraphicsCtrlSeq,
  '\\caption': CaptionCtrlSeq,
  '\\def': DefCtrlSeq,
  '\\let': LetCtrlSeq,
  '\\left': LeftCtrlSeq,
  '\\right': RightCtrlSeq,
  '\\newcommand': NewCommandCtrlSeq,
  '\\renewcommand': RenewCommandCtrlSeq,
  '\\newenvironment': NewEnvironmentCtrlSeq,
  '\\renewenvironment': RenewEnvironmentCtrlSeq,
  '\\book': BookCtrlSeq,
  '\\part': PartCtrlSeq,
  '\\addpart': PartCtrlSeq, // KOMA-Script unnumbered part
  '\\chapter': ChapterCtrlSeq,
  '\\addchap': ChapterCtrlSeq, // KOMA-Script unnumbered chapter
  '\\section': SectionCtrlSeq,
  '\\addsec': SectionCtrlSeq, // KOMA-Script unnumbered section
  '\\addseq': SectionCtrlSeq, // kept for backward compatibility (likely typo for \addsec)
  '\\subsection': SubSectionCtrlSeq,
  '\\subsubsection': SubSubSectionCtrlSeq,
  '\\paragraph': ParagraphCtrlSeq,
  '\\subparagraph': SubParagraphCtrlSeq,
  '\\input': InputCtrlSeq,
  '\\include': IncludeCtrlSeq,
  '\\item': ItemCtrlSeq,
  '\\centering': CenteringCtrlSeq,
  '\\newtheorem': NewTheoremCtrlSeq,
  '\\theoremstyle': TheoremStyleCtrlSeq,
  '\\bibliography': BibliographyCtrlSeq,
  '\\bibliographystyle': BibliographyStyleCtrlSeq,
  '\\maketitle': MaketitleCtrlSeq,
  '\\textcolor': TextColorCtrlSeq,
  '\\colorbox': ColorBoxCtrlSeq,
  '\\hline': HLineCtrlSeq,
  '\\toprule': TopRuleCtrlSeq,
  '\\midrule': MidRuleCtrlSeq,
  '\\bottomrule': BottomRuleCtrlSeq,
  '\\multicolumn': MultiColumnCtrlSeq,
  '\\parbox': ParBoxCtrlSeq,
  '\\textbf': TextBoldCtrlSeq,
  '\\textit': TextItalicCtrlSeq,
  '\\textsc': TextSmallCapsCtrlSeq,
  '\\texttt': TextTeletypeCtrlSeq,
  '\\textmd': TextMediumCtrlSeq,
  '\\textsf': TextSansSerifCtrlSeq,
  '\\textsuperscript': TextSuperscriptCtrlSeq,
  '\\textsubscript': TextSubscriptCtrlSeq,
  '\\sout': TextStrikeOutCtrlSeq,
  '\\emph': EmphasisCtrlSeq,
  '\\underline': UnderlineCtrlSeq,
  '\\setlength': SetLengthCtrlSeq,
}
// Specializer for control sequences: maps a control-sequence name to a
// dedicated token id, or -1 to leave the generic token unchanged.
// Category checks run in a fixed order; the final fallback consults the
// otherKnowncommands table.
export const specializeCtrlSeq = (name, terms) => {
  // environment delimiters always get their own tokens
  if (name === '\\begin') return Begin
  if (name === '\\end') return End
  // cross-referencing commands (plain and starrable variants)
  if (refCommands.has(name)) return RefCtrlSeq
  if (refStarrableCommands.has(name)) return RefStarrableCtrlSeq
  // citation commands (plain and starrable variants)
  if (citeCommands.has(name)) return CiteCtrlSeq
  if (citeStarredCommands.has(name)) return CiteStarrableCtrlSeq
  // label definitions and text-mode-in-math commands
  if (labelCommands.has(name)) return LabelCtrlSeq
  if (mathTextCommands.has(name)) return MathTextCtrlSeq
  // fall back to the lookup table; -1 means "no specialization"
  return otherKnowncommands[name] || -1
}
// Environments parsed with tabular (column-specifier) content.
const tabularEnvNames = new Set(
  ['tabular', 'xltabular', 'tabularx', 'longtable']
)
// Environments whose body is a single (non-aligned) math formula.
// Insertion order is preserved from the original list.
const equationEnvNames = new Set([
  'equation', 'equation*', 'displaymath', 'displaymath*',
  'math', 'math*', 'multline', 'multline*',
  'matrix', 'tikzcd',
])
// Math environments with alignment structure (&-separated columns and \\
// row breaks): eqnarray/amsmath alignment forms, matrix variants, cases,
// and IEEEeqnarray. Insertion order is preserved from the original list.
const equationArrayEnvNames = new Set([
  'array', 'eqnarray', 'eqnarray*',
  'align', 'align*', 'alignat', 'alignat*',
  'flalign', 'flalign*', 'gather', 'gather*',
  'pmatrix', 'pmatrix*', 'bmatrix', 'bmatrix*',
  'Bmatrix', 'Bmatrix*', 'vmatrix', 'vmatrix*',
  'Vmatrix', 'Vmatrix*', 'smallmatrix', 'smallmatrix*',
  'split', 'split*', 'gathered', 'gathered*',
  'aligned', 'aligned*', 'alignedat', 'alignedat*',
  'cases', 'cases*', 'dcases', 'dcases*',
  'rcases', 'rcases*', 'IEEEeqnarray', 'IEEEeqnarray*',
])
// Environments whose body is scanned verbatim — no LaTeX tokenization is
// performed inside them (verbatim/fancyvrb/listings/minted/tcolorbox and
// the comment environment).
// Fix: the original list contained 'lstlisting' twice; the duplicate entry
// was redundant (Set deduplicates) and has been removed.
const verbatimEnvNames = new Set([
  'verbatim',
  'boxedverbatim',
  'lstlisting',
  'minted',
  'Verbatim',
  'tcblisting',
  'codeexample',
  'comment',
])
// Lookup table mapping other known environment names (as written inside
// \begin{...}) to their specialized token ids. Starred/sub-variants of
// figure share the FigureEnvName token; enumerate/itemize/description all
// share ListEnvName. Consulted as the fallback in specializeEnvName.
const otherKnownEnvNames = {
  document: DocumentEnvName,
  tikzpicture: TikzPictureEnvName,
  figure: FigureEnvName,
  'figure*': FigureEnvName,
  subfigure: FigureEnvName,
  enumerate: ListEnvName,
  itemize: ListEnvName,
  table: TableEnvName,
  description: ListEnvName,
}
// Specializer for environment names: maps the name found in \begin{...}
// to a dedicated token id, or -1 to leave the generic token unchanged.
// Category checks run in a fixed order; the final fallback consults the
// otherKnownEnvNames table.
export const specializeEnvName = (name, terms) => {
  if (tabularEnvNames.has(name)) return TabularEnvName
  if (equationEnvNames.has(name)) return EquationEnvName
  if (equationArrayEnvNames.has(name)) return EquationArrayEnvName
  if (verbatimEnvNames.has(name)) return VerbatimEnvName
  // -1 means "no specialization"
  return otherKnownEnvNames[name] || -1
}
// Lookup table mapping known control symbols (a backslash followed by a
// single non-letter) to their specialized token ids: the \( \) and \[ \]
// math-mode delimiters and the \\ line break.
const otherKnownCtrlSyms = {
  '\\(': OpenParenCtrlSym,
  '\\)': CloseParenCtrlSym,
  '\\[': OpenBracketCtrlSym,
  '\\]': CloseBracketCtrlSym,
  '\\\\': LineBreakCtrlSym,
}
// Specializer for control symbols: maps a control symbol to a dedicated
// token id via the otherKnownCtrlSyms table, or -1 ("no specialization").
export const specializeCtrlSym = (name, terms) => {
  const term = otherKnownCtrlSyms[name]
  return term || -1
}