-
Notifications
You must be signed in to change notification settings - Fork 10
Expand file tree
/
Copy pathtokenizer.lua
More file actions
55 lines (45 loc) · 1.03 KB
/
tokenizer.lua
File metadata and controls
55 lines (45 loc) · 1.03 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
local lexer = require "pl.lexer"
local yield = coroutine.yield
local M = {}
--- Build a scanner callback that labels every matched token with a fixed
-- kind and hands the (kind, token) pair to the consumer via coroutine
-- yield.  pl.lexer drives these from inside a coroutine, so yielding
-- here delivers one token per resume.
local function emit(kind)
  return function(token)
    return coroutine.yield(kind, token)
  end
end

-- One callback per token class; these locals are referenced by the rule
-- table (and, by identity, by the filter table) in M.tokenize below.
local word     = emit("word")
local quote    = emit("quote")
local space    = emit("space")
local tag      = emit("tag")
local punct    = emit("punct")
local endpunct = emit("endpunct")
local unknown  = emit("unknown")
--- Tokenize `text` into a stream of (kind, token) pairs via pl.lexer.
-- @tparam string text the input to scan
-- @treturn function an iterator that yields (kind, token) pairs, with
--   space and tag tokens already filtered out
function M.tokenize(text)
  -- Match rules, tried in order — the order is significant: runs of
  -- dashes become space before the single-char punct class can claim
  -- a dash, and runs of dots become end punctuation before the
  -- single-char [.?!] rule runs.  "^." is the catch-all fallback.
  local rules = {
    { "^%s+", space },
    { "^['\"]", quote },
    { "^%w+", word },
    { "^%-+", space },
    { "^[,:;%-]", punct },
    { "^%.+", endpunct },
    { "^[%.%?!]", endpunct },
    { "^</?.->", tag },
    { "^.", unknown },
  }
  -- Tokens produced by these handlers are dropped from the stream
  -- (keyed by callback identity, as pl.lexer's filter table expects).
  local drop = { [space] = true, [tag] = true }
  return lexer.scan(text, rules, drop)
end
--- Reassemble a sequence of tokens into a sentence string.
-- Capitalizes the leading letter and removes the separator spaces that
-- table.concat inserted around quote and punctuation tokens.
-- @tparam {string,...} words sequence of tokens (may be empty)
-- @treturn string the joined sentence ("" for an empty sequence)
function M.join(words)
  local s = table.concat(words, " ")
  -- Capitalize the first character if it is a lowercase letter.
  s = s:gsub("^%l", string.upper)
  -- Collapse spaces around quote tokens: {"don", "'", "t"} -> "don't".
  -- The tokenizer emits both ' and " as quote tokens, so both must be
  -- handled here (the original only repaired single quotes).
  s = s:gsub(" (['\"]) ", "%1")
  -- Drop the space before trailing punctuation: {"Hi", ","} -> "Hi,".
  s = s:gsub(" ([,:;%-%.%?!])", "%1")
  return s
end
return M