howl.util.lpeg_lexer

the module can be called directly to create a lexer (same as new())

assert.not_has_error -> l -> true

the resulting lexer can be called directly

lexer = l -> P'x' * Cp!
assert.same { 2 }, lexer 'x'
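
In practice a whole lexer definition would be passed to the module, along these lines (an illustrative sketch only; the style name and pattern are made up, and it assumes the helper definitions described further down are available unqualified inside the definition function, as the sub-mode examples later on show):

lexer = l ->
  -- a single, hypothetical token kind, styled as 'operator'
  capture 'operator', S'+-*/'

assert.same { 1, 'operator', 2 }, lexer '+'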

imports lpeg definitions locally into the module

for op in *{'Cp', 'Ct', 'S', 'P'}
  assert.is_not_nil l[op]

imports lpeg.locale definitions locally into the module

for ldef in *{'digit', 'upper', 'print', 'lower'}
  assert.is_not_nil l[ldef]

new(definition)

accepts a function

assert.not_has_error -> l.new -> true

capture(style, pattern)

returns an LPeg pattern

assert.equal 'pattern', lpeg.type l.capture('foo', P(1))

the returned pattern produces the three captures <start-pos>, <style-name> and <end-pos> if <pattern> matches

p = l.capture 'foo', P'fo'
assert.same { 1, 'foo', 3 }, { p\match 'foobar' }

predefined helper patterns

.eol

matches and consumes newlines

assert.is_not_nil l.eol\match '\n'
assert.is_not_nil l.eol\match '\r'
assert.equals 2, (l.eol * Cp!)\match '\n'
assert.equals 3, (l.eol * Cp!)\match '\r\n'

assert.is_nil l.eol\match 'a'
assert.is_nil l.eol\match '2'

.float

matches and consumes various float representations

for repr in *{ '34.5', '3.45e2', '1.234E1', '3.45e-2', '.32' }
  assert.is_not_nil l.float\match repr

.hexadecimal

matches and consumes various hexadecimal representations

for repr in *{ '0xfeab', '0XDEADBEEF' }
  assert.is_not_nil l.hexadecimal\match repr

does not match illegal hexadecimal representations

assert.is_nil l.hexadecimal\match '0xCDEFG'

.hexadecimal_float

matches and consumes various hexadecimal float representations

for repr in *{ '0xfep2', '0XAP-3' }
  assert.is_not_nil l.hexadecimal_float\match repr

does not match illegal hexadecimal representations

assert.is_nil l.hexadecimal_float\match '0xFGp3'

.octal

matches and consumes octal representations

assert.is_not_nil l.octal\match '0123'

does not match illegal octal representations

assert.is_nil l.octal\match '0128'

.line_start

matches after a newline or at the start of the text

assert.is_not_nil l.line_start\match 'x'
assert.is_not_nil (l.eol * l.line_start * P'x')\match '\nx'

does not consume anything

assert.equals 2, (l.eol * l.line_start * Cp!)\match '\nx'
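
Since .line_start does not consume anything, a typical use is anchoring a construct that is only valid at the beginning of a line, e.g. (a hedged sketch; the directive style and pattern are made up):

directive = l.line_start * l.capture('directive', P'#' * l.alpha^1)
assert.is_not_nil directive\match '#include'
assert.is_not_nil (P'x' * l.eol * directive)\match 'x\n#include'
assert.is_nil (P'x ' * directive)\match 'x #include'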

any(list)

the resulting pattern is an ordered match of any member of <list>

p = l.any { 'one', 'two' }
assert.is_not_nil p\match 'one'
assert.is_not_nil p\match 'two'
assert.is_nil p\match 'three'

the members of <list> can also be passed as vararg arguments

p = l.any 'one', 'two'
assert.is_not_nil p\match 'two'
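
As an illustration, any is what one would typically use to combine the predefined number helpers above into a single capture (a sketch only; the ordering, trying the more specific representations first, is an assumption):

-- try the more specific representations before the general ones
number_p = l.any {
  l.hexadecimal_float,
  l.hexadecimal,
  l.octal,
  l.float,
  l.digit^1
}
number = l.capture 'number', number_p
assert.is_not_nil number\match '0xfeab'
assert.is_not_nil number\match '3.45e2'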

sequence(list)

the resulting pattern is a chained match of all members of <list>

p = l.sequence { 'one', 'two' }
assert.is_nil p\match 'one'
assert.is_nil p\match 'two'
assert.is_not_nil p\match 'onetwo'
assert.is_nil p\match 'Xonetwo'

the members of <list> can also be passed as vararg arguments

p = l.sequence 'one', 'two'
assert.is_not_nil p\match 'onetwo'

word(list)

grammar = P {
  V'word' + P(1) * V(1)
  word: l.word { 'one', 'two2' }
}

returns a pattern that matches any word in <list>

assert.is_not_nil grammar\match 'one'
assert.is_not_nil grammar\match 'so one match'
assert.is_not_nil grammar\match '!one'
assert.is_not_nil grammar\match 'one()'
assert.is_not_nil grammar\match 'then two2,'
assert.is_nil grammar\match 'three'

only matches standalone words, not substring occurrences

assert.is_nil grammar\match 'fone'
assert.is_nil grammar\match 'one2'
assert.is_nil grammar\match 'two2fold'
assert.is_nil grammar\match 'two2_fold'

accepts vararg parameters

assert.is_not_nil l.word('one', 'two')\match 'two'
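
In a lexer, word is the natural building block for keyword styling, e.g. (a hedged sketch; the keyword list is made up):

keyword = l.capture 'keyword', l.word { 'if', 'else', 'while', 'return' }
assert.is_not_nil keyword\match 'return'
assert.is_nil keyword\match 'returned'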

separate(p)

returns a pattern that only matches if <p> is not part of a larger word

p = l.separate(P'foo')
assert.is_not_nil p\match 'foo'
assert.is_not_nil (l.blank * p)\match ' foo'
assert.is_not_nil (p)\match 'foo '
assert.is_not_nil (p)\match 'foo*'
assert.is_nil (l.alpha * p)\match 'xfoo '
assert.is_nil (P(1) * p)\match '_foo '
assert.is_nil p\match 'foox '
assert.is_nil p\match 'foo_ '

span(start_p, stop_p [, escape_p])

p = l.span('{', '}') * Cp!

matches and consumes from <start_p> up to and including <stop_p>

assert.equals 3, p\match '{}'
assert.equals 5, p\match '{xx}'

always considers <EOF> as an alternate stop marker

assert.equals 3, p\match '{x'

allows escaping <stop_p> with <escape_p>

p = l.span('{', '}', '\\') * Cp!
assert.equals 5, p\match '{\\}}'
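
A typical application of span is a string token with an escape character, e.g. (an illustrative sketch; the quote and escape characters are assumptions):

string_p = l.capture 'string', l.span("'", "'", '\\')
assert.same { 1, 'string', 5 }, { string_p\match "'ab'cd" }
assert.same { 1, 'string', 7 }, { string_p\match "'a\\'b'" }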

paired(p, escape [, pair_style, content_style])

p = l.paired(1) * Cp!

matches and consumes from <p> up to and including the matching <p>

assert.equals 3, p\match '||x'
assert.equals 5, p\match '|xx|x'

always considers <EOF> as an alternate stop marker

assert.equals 3, p\match '|x'

allows escaping the end delimiter with <escape>

p = l.paired(1, '\\') * Cp!
assert.equals 5, p\match '|\\|| foo\\'

(when pair_style and content_style are specified)

captures the components in the specified styles

p = l.paired(1, nil, 'keyword', 'string')
expected = {
  1, 'keyword', 2,
  2, 'string', 5,
  5, 'keyword', 6,
}
assert.same expected, { p\match '|foo|' }

still handles escapes properly

p = l.paired(1, '%', 'keyword', 'string')
expected = {
  1, 'keyword', 2,
  2, 'string', 6,
  6, 'keyword', 7,
}
assert.same expected, { p\match '|f%|o|' }

back_was(name, value)

p = Cg(l.alpha^1, 'group') * ' ' * l.back_was('group', 'foo')

matches if the named capture <name> previously matched <value>

assert.is_not_nil p\match 'foo '

does not match if the named capture <name> did not match <value>

assert.is_nil p\match 'bar '

produces no captures

assert.equals 1, #{ p\match 'foo ' }

last_token_matches(pattern)

matches if the last non-blank token matches <pattern>

p = l.blank^0 * l.digit^1 * l.blank^0 * l.last_token_matches(l.digit)
assert.is_not_nil p\match '123 '
assert.is_not_nil p\match '123 \t '
assert.is_not_nil p\match ' 123 '
assert.is_not_nil p\match ' 1 '
assert.is_not_nil p\match '1 '
assert.is_not_nil p\match ' 1'
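
This kind of look-behind is useful for tokens whose meaning depends on what precedes them, for example only treating '/' as the start of a regex literal after an operator or opening bracket (a hedged sketch; the set of preceding tokens is made up):

-- '/..../' is styled as a regex only when the last non-blank token
-- matches one of the given characters
regex = l.last_token_matches(l.S'=(,[') * l.capture('regex', l.span('/', '/', '\\'))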

match_back(name)

p = Cg(P'x', 'start') * 'y' * l.match_back('start')

matches the same text as previously matched by the named capture <name>

assert.equals 4, p\match 'xyxzx'

produces no captures

assert.equals 1, #{ p\match 'xyxzx' }

scan_until(stop_p [, escape_p])

matches until the specified pattern or <EOF>

assert.equals 3, (l.scan_until('x') * Cp!)\match '12x'
assert.equals 4, (l.scan_until('x') * Cp!)\match '123'

allows escaping <stop_p> with <escape_p>

p = l.scan_until('}', '\\') * Cp!
assert.equals 4, p\match '{\\}}'
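
A common use of scan_until is a line comment that runs up to, but not including, the end of the line (a sketch; the comment prefix is made up):

comment = l.capture('comment', P'#' * l.scan_until(l.eol))
assert.same { 1, 'comment', 6 }, { comment\match '# foo\nbar' }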

scan_to(stop_p [, escape_p])

matches up to and including the specified pattern, or until <EOF>

assert.equals 4, (l.scan_to('x') * Cp!)\match '12x'
assert.equals 4, (l.scan_to('x') * Cp!)\match '123'

allows escaping <stop_p> with <escape_p>

p = l.scan_to('}', '\\') * Cp!
assert.equals 5, p\match '{\\}}'

scan_through_indented()

p = P' ' * l.scan_through_indented! * Cp!

matches until a line whose indentation is smaller than or equal to that of the current line

assert.equals 4, p\match ' x\n y'
assert.equals 8, p\match ' x\n  y\n z'

matches until the end of the text if no line with smaller or equal indentation is found

assert.equals 7, p\match ' x\n  y'

uses the indentation of the line containing the eol when positioned right at an eol

p = l.eol * l.scan_through_indented! * Cp!
assert.equals 8, p\match ' x\n  y\n z', 3

scan_until_capture(name, escape [, halt_at, halt_at_N, ..])

matches until the text previously matched by the named capture <name>

p = Cg('x', 'start') * l.scan_until_capture('start')
assert.equals 4, p\match 'xyzx'

stops matching at any of the optional <halt_at> parameters

p = Cg('x', 'start') * l.scan_until_capture('start', nil, 'z')
assert.equals 3, p\match 'xyzx'

treats all stop parameters as strings and not patterns

p = Cg('x', 'start') * l.scan_until_capture('start', nil, '%w')
assert.equals 4, p\match 'xyz%w'

does not halt on escaped matches

p = Cg('x', 'start') * l.scan_until_capture('start', '\\', 'z')
assert.equals 7, p\match 'xy\\x\\zx'

matches until <EOF> if no match is found

p = Cg('x', 'start') * l.scan_until_capture('start')
assert.equals 4, p\match 'xyz'

match_until(stop_p, p)

p = l.match_until('\n', C(l.alpha)) * Cp!

matches <p> repeatedly until <stop_p> matches

assert.same { 'x', 'y', 'z', 4 }, { p\match 'xyz\nx' }

matches until <EOF> if <stop_p> is not found

assert.same { 'x', 'y', 3 }, { p\match 'xy' }

complement(p)

matches a single character wherever <p> does not match

assert.is_not_nil l.complement('a')\match 'b'
assert.is_nil l.complement('a')\match 'a'
assert.equals 3, (l.complement('a')^1 * Cp!)\match 'bca'
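
A small illustration (a sketch only): capturing everything up to the next comma as one field token.

field = l.capture('field', l.complement(',')^1)
assert.same { 1, 'field', 4 }, { field\match 'foo,bar' }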

sub_lex_by_pattern(mode_p, mode_style, stop_p)

lexes any leading space followed by eol as extended whitespace

p = l.sub_lex_by_pattern(l.alpha^1, 'keyword', '>')
res = { p\match 'xx \n123>' }
assert.same {
  1, 'keyword', 3,
  3, 'default:whitespace', 5,
  5, 'embedded', 8
}, res

(when no mode is found for the <mode_p> capture)

emits mode match styling and an embedded capture for the sub text

p = l.sub_lex_by_pattern(l.alpha^1, 'keyword', '>')
res = { p\match 'xx123>' }
assert.same {
  1, 'keyword', 3,
  3, 'embedded', 6
}, res

(when a mode matching the <mode_p> capture exists)

local p

before_each ->
  sub_mode = lexer: l -> capture('number', digit^1)
  mode.register name: 'dynsub', create: -> sub_mode
  p = l.P'<' * l.sub_lex_by_pattern(l.alpha^1, 'keyword', '>')

after_each ->
  mode.unregister 'dynsub'

emits mode match styling and rebasing instructions to the styler

assert.same {
  2, 'keyword', 8,
  8, {}, 'dynsub|embedded'
}, { p\match '<dynsub>' }

lexes the content using that mode's lexer until <stop_p>

assert.same {
  2, 'keyword', 8,
  8, { 1, 'number', 4 }, 'dynsub|embedded'
}, { p\match '<dynsub123>' }

sub_lex(mode_name, stop_p)

lexes any leading space followed by eol as extended whitespace

p = l.sub_lex('unknown', '>')
res = { p\match ' \n123>' }
assert.same {
  1, 'default:whitespace', 3,
  3, 'embedded', 6
}, res

(when no mode is found matching <mode_name>)

captures using the embedded style until <stop_p>

p = l.sub_lex('unknown', '>')
res = { p\match 'xx>' }
assert.same {1, 'embedded', 3}, res

(when a mode matching <mode_name> exists)

local p

before_each ->
  sub_mode = lexer: l -> capture('number', digit^1)
  mode.register name: 'sub', create: -> sub_mode
  p = l.sub_lex('sub', '>')

after_each ->
  mode.unregister 'sub'

sub_captures_for = (text) ->
  res = { p\match text }
  res[2]

emits rebasing instructions to the styler

assert.same { 1, {}, 'sub|embedded' }, { p\match '' }

lexes the content using that mode's lexer until <stop_p>

assert.same {1, 'number', 3}, sub_captures_for '12>'

lexes until EOF if <stop_p> is not found

assert.same {1, 'number', 3}, sub_captures_for '12'
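
As an illustration of how sub lexing is typically put to use, an HTML-like lexer could hand the contents of a script tag over to another mode (a hedged sketch; the 'javascript' mode name and the tag patterns are assumptions, and the actual captures depend on whether such a mode is registered):

p = l.sequence {
  l.capture('tag', P'<script>'),
  l.sub_lex('javascript', '</script>')
}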

sub_lex_by_lexer(base_style, stop_p, lexer)

sub-lexes using the provided lexer

sub_lexer = l -> capture('number', digit^1)
lexer = l -> sequence {
  capture('keyword', 'x'),
  sub_lex_by_lexer('string', l.eol, sub_lexer)
}
assert.same {
  1, 'keyword', 2,
  2, { 1, 'number', 2 }, 'inline|string'
}, lexer('x2')

sub_lex_by_inline(base_style, match_p, pattern)

sub-lexes the matched text using the provided pattern

lexer = l ->
  sub_lexer = capture('number', digit^1)
  sequence {
    capture('keyword', 'x'),
    sub_lex_by_inline('string', l.scan_until(l.eol), sub_lexer)
  }
assert.same {
  1, 'keyword', 2,
  2, { 1, 'number', 2 }, 'inline|string'
}, lexer('x2')

adds a zero-width styling instruction at the end if needed

lexer = l ->
  sub_lexer = capture('number', digit^1)
  alpha * sub_lex_by_inline('string', l.scan_until(l.eol), sub_lexer)

assert.same {
  2, {
    1, 'number', 2,
    3, 'whitespace', 3
  }, 'inline|string'
}, lexer('x2x')

compose(base_mode, pattern)

returns a pattern that combines <pattern> with the base mode's lexing pattern

base_mode = lexer: l -> capture('number', digit^1)
mode.register name: 'base_mode', create: -> base_mode
p = l.compose('base_mode', l.capture('override', l.alpha))^0
assert.same {
  1, 'override', 2,
  2, 'number', 3
}, { p\match 'a2' }

built-in lexing support

automatically lexes whitespace

lexer = l -> P'peace-and-quiet'
assert.same { 1, 'whitespace', 3 }, lexer ' \n'

automatically skips non-recognized tokens

lexer = l -> capture 'foo', P'foo'
assert.same { 2, 'foo', 5 }, lexer '|foo'
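
Putting the pieces together, a small lexer with two captures yields one flat list of triplets, with the whitespace in between lexed automatically (a hedged sketch; it assumes the helpers are available unqualified inside the definition function, as in the sub-mode examples above):

lexer = l ->
  keyword = capture 'keyword', word { 'local' }
  identifier = capture 'identifier', alpha^1
  any { keyword, identifier }

assert.same {
  1, 'keyword', 6,
  6, 'whitespace', 7,
  7, 'identifier', 10
}, lexer 'local foo'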