contrib, build: bundle LuaSrcDiet and make it available in build targets
[project/luci.git] / contrib / luasrcdiet / lua / optparser.lua
1 --[[--------------------------------------------------------------------
2
3 optparser.lua: does parser-based optimizations
4 This file is part of LuaSrcDiet.
5
6 Copyright (c) 2008 Kein-Hong Man <khman@users.sf.net>
7 The COPYRIGHT file describes the conditions
8 under which this software may be distributed.
9
10 See the ChangeLog for more information.
11
12 ----------------------------------------------------------------------]]
13
14 --[[--------------------------------------------------------------------
15 -- NOTES:
16 -- * For more parser-based optimization ideas, see the TODO items or
17 -- look at technotes.txt.
18 -- * The processing load is quite significant, but since this is an
19 -- off-line text processor, I believe we can wait a few seconds.
20 -- * TODO: might process "local a,a,a" wrongly... need tests!
21 -- * TODO: remove position handling if overlapped locals (rem < 0)
22 -- needs more study, to check behaviour
23 -- * TODO: there are probably better ways to do allocation, e.g. by
24 -- choosing better methods to sort and pick locals...
25 -- * TODO: we don't need 53*63 two-letter identifiers; we can make
26 -- do with significantly less depending on how many that are really
27 -- needed and improve entropy; e.g. 13 needed -> choose 4*4 instead
28 ----------------------------------------------------------------------]]
29
30 local base = _G
31 local string = require "string"
32 local table = require "table"
33 module "optparser"
34
35 ----------------------------------------------------------------------
36 -- Letter frequencies for reducing symbol entropy (fixed version)
37 -- * Might help a wee bit when the output file is compressed
38 -- * See Wikipedia: http://en.wikipedia.org/wiki/Letter_frequencies
39 -- * We use letter frequencies according to a Linotype keyboard, plus
40 -- the underscore, and both lower case and upper case letters.
41 -- * The arrangement below (LC, underscore, %d, UC) is arbitrary.
42 -- * This is certainly not optimal, but is quick-and-dirty and the
43 -- process has no significant overhead
44 ----------------------------------------------------------------------
45
46 local LETTERS = "etaoinshrdlucmfwypvbgkqjxz_ETAOINSHRDLUCMFWYPVBGKQJXZ"
47 local ALPHANUM = "etaoinshrdlucmfwypvbgkqjxz_0123456789ETAOINSHRDLUCMFWYPVBGKQJXZ"
48
49 -- names or identifiers that must be skipped
50 -- * the first two lines are for keywords
51 local SKIP_NAME = {}
52 for v in string.gmatch([[
53 and break do else elseif end false for function if in
54 local nil not or repeat return then true until while
55 self]], "%S+") do
56 SKIP_NAME[v] = true
57 end
58
59 ------------------------------------------------------------------------
60 -- variables and data structures
61 ------------------------------------------------------------------------
62
63 local toklist, seminfolist, -- token lists
64 globalinfo, localinfo, -- variable information tables
65 globaluniq, localuniq, -- unique name tables
66 var_new, -- index of new variable names
67 varlist -- list of output variables
68
69 ----------------------------------------------------------------------
70 -- preprocess information table to get lists of unique names
71 ----------------------------------------------------------------------
72
73 local function preprocess(infotable)
74 local uniqtable = {}
75 for i = 1, #infotable do -- enumerate info table
76 local obj = infotable[i]
77 local name = obj.name
78 --------------------------------------------------------------------
79 if not uniqtable[name] then -- not found, start an entry
80 uniqtable[name] = {
81 decl = 0, token = 0, size = 0,
82 }
83 end
84 --------------------------------------------------------------------
85 local uniq = uniqtable[name] -- count declarations, tokens, size
86 uniq.decl = uniq.decl + 1
87 local xref = obj.xref
88 local xcount = #xref
89 uniq.token = uniq.token + xcount
90 uniq.size = uniq.size + xcount * #name
91 --------------------------------------------------------------------
92 if obj.decl then -- if local table, create first,last pairs
93 obj.id = i
94 obj.xcount = xcount
95 if xcount > 1 then -- if ==1, means local never accessed
96 obj.first = xref[2]
97 obj.last = xref[xcount]
98 end
99 --------------------------------------------------------------------
100 else -- if global table, add a back ref
101 uniq.id = i
102 end
103 --------------------------------------------------------------------
104 end--for
105 return uniqtable
106 end
107
108 ----------------------------------------------------------------------
109 -- calculate actual symbol frequencies, in order to reduce entropy
110 -- * this may help further reduce the size of compressed sources
111 -- * note that since parsing optimizations is put before lexing
112 -- optimizations, the frequency table is not exact!
113 -- * yes, this will miss --keep block comments too...
114 ----------------------------------------------------------------------
115
116 local function recalc_for_entropy(option)
117 local byte = string.byte
118 local char = string.char
119 -- table of token classes to accept in calculating symbol frequency
120 local ACCEPT = {
121 TK_KEYWORD = true, TK_NAME = true, TK_NUMBER = true,
122 TK_STRING = true, TK_LSTRING = true,
123 }
124 if not option["opt-comments"] then
125 ACCEPT.TK_COMMENT = true
126 ACCEPT.TK_LCOMMENT = true
127 end
128 --------------------------------------------------------------------
129 -- create a new table and remove any original locals by filtering
130 --------------------------------------------------------------------
131 local filtered = {}
132 for i = 1, #toklist do
133 filtered[i] = seminfolist[i]
134 end
135 for i = 1, #localinfo do -- enumerate local info table
136 local obj = localinfo[i]
137 local xref = obj.xref
138 for j = 1, obj.xcount do
139 local p = xref[j]
140 filtered[p] = "" -- remove locals
141 end
142 end
143 --------------------------------------------------------------------
144 local freq = {} -- reset symbol frequency table
145 for i = 0, 255 do freq[i] = 0 end
146 for i = 1, #toklist do -- gather symbol frequency
147 local tok, info = toklist[i], filtered[i]
148 if ACCEPT[tok] then
149 for j = 1, #info do
150 local c = byte(info, j)
151 freq[c] = freq[c] + 1
152 end
153 end--if
154 end--for
155 --------------------------------------------------------------------
156 -- function to re-sort symbols according to actual frequencies
157 --------------------------------------------------------------------
158 local function resort(symbols)
159 local symlist = {}
160 for i = 1, #symbols do -- prepare table to sort
161 local c = byte(symbols, i)
162 symlist[i] = { c = c, freq = freq[c], }
163 end
164 table.sort(symlist, -- sort selected symbols
165 function(v1, v2)
166 return v1.freq > v2.freq
167 end
168 )
169 local charlist = {} -- reconstitute the string
170 for i = 1, #symlist do
171 charlist[i] = char(symlist[i].c)
172 end
173 return table.concat(charlist)
174 end
175 --------------------------------------------------------------------
176 LETTERS = resort(LETTERS) -- change letter arrangement
177 ALPHANUM = resort(ALPHANUM)
178 end
179
180 ----------------------------------------------------------------------
181 -- returns a string containing a new local variable name to use, and
182 -- a flag indicating whether it collides with a global variable
183 -- * trapping keywords and other names like 'self' is done elsewhere
184 ----------------------------------------------------------------------
185
186 local function new_var_name()
187 local var
188 local cletters, calphanum = #LETTERS, #ALPHANUM
189 local v = var_new
190 if v < cletters then -- single char
191 v = v + 1
192 var = string.sub(LETTERS, v, v)
193 else -- longer names
194 local range, sz = cletters, 1 -- calculate # chars fit
195 repeat
196 v = v - range
197 range = range * calphanum
198 sz = sz + 1
199 until range > v
200 local n = v % cletters -- left side cycles faster
201 v = (v - n) / cletters -- do first char first
202 n = n + 1
203 var = string.sub(LETTERS, n, n)
204 while sz > 1 do
205 local m = v % calphanum
206 v = (v - m) / calphanum
207 m = m + 1
208 var = var..string.sub(ALPHANUM, m, m)
209 sz = sz - 1
210 end
211 end
212 var_new = var_new + 1
213 return var, globaluniq[var] ~= nil
214 end
215
216 ----------------------------------------------------------------------
217 -- calculate and print some statistics
218 -- * probably better in main source, put here for now
219 ----------------------------------------------------------------------
220
221 local function stats_summary(globaluniq, localuniq, afteruniq, option)
222 local print = print or base.print
223 local fmt = string.format
224 local opt_details = option.DETAILS
225 local uniq_g , uniq_li, uniq_lo, uniq_ti, uniq_to, -- stats needed
226 decl_g, decl_li, decl_lo, decl_ti, decl_to,
227 token_g, token_li, token_lo, token_ti, token_to,
228 size_g, size_li, size_lo, size_ti, size_to
229 = 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
230 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
231 local function avg(c, l) -- safe average function
232 if c == 0 then return 0 end
233 return l / c
234 end
235 --------------------------------------------------------------------
236 -- collect statistics (note: globals do not have declarations!)
237 --------------------------------------------------------------------
238 for name, uniq in base.pairs(globaluniq) do
239 uniq_g = uniq_g + 1
240 token_g = token_g + uniq.token
241 size_g = size_g + uniq.size
242 end
243 for name, uniq in base.pairs(localuniq) do
244 uniq_li = uniq_li + 1
245 decl_li = decl_li + uniq.decl
246 token_li = token_li + uniq.token
247 size_li = size_li + uniq.size
248 end
249 for name, uniq in base.pairs(afteruniq) do
250 uniq_lo = uniq_lo + 1
251 decl_lo = decl_lo + uniq.decl
252 token_lo = token_lo + uniq.token
253 size_lo = size_lo + uniq.size
254 end
255 uniq_ti = uniq_g + uniq_li
256 decl_ti = decl_g + decl_li
257 token_ti = token_g + token_li
258 size_ti = size_g + size_li
259 uniq_to = uniq_g + uniq_lo
260 decl_to = decl_g + decl_lo
261 token_to = token_g + token_lo
262 size_to = size_g + size_lo
263 --------------------------------------------------------------------
264 -- detailed stats: global list
265 --------------------------------------------------------------------
266 if opt_details then
267 local sorted = {} -- sort table of unique global names by size
268 for name, uniq in base.pairs(globaluniq) do
269 uniq.name = name
270 sorted[#sorted + 1] = uniq
271 end
272 table.sort(sorted,
273 function(v1, v2)
274 return v1.size > v2.size
275 end
276 )
277 local tabf1, tabf2 = "%8s%8s%10s %s", "%8d%8d%10.2f %s"
278 local hl = string.rep("-", 44)
279 print("*** global variable list (sorted by size) ***\n"..hl)
280 print(fmt(tabf1, "Token", "Input", "Input", "Global"))
281 print(fmt(tabf1, "Count", "Bytes", "Average", "Name"))
282 print(hl)
283 for i = 1, #sorted do
284 local uniq = sorted[i]
285 print(fmt(tabf2, uniq.token, uniq.size, avg(uniq.token, uniq.size), uniq.name))
286 end
287 print(hl)
288 print(fmt(tabf2, token_g, size_g, avg(token_g, size_g), "TOTAL"))
289 print(hl.."\n")
290 --------------------------------------------------------------------
291 -- detailed stats: local list
292 --------------------------------------------------------------------
293 local tabf1, tabf2 = "%8s%8s%8s%10s%8s%10s %s", "%8d%8d%8d%10.2f%8d%10.2f %s"
294 local hl = string.rep("-", 70)
295 print("*** local variable list (sorted by allocation order) ***\n"..hl)
296 print(fmt(tabf1, "Decl.", "Token", "Input", "Input", "Output", "Output", "Global"))
297 print(fmt(tabf1, "Count", "Count", "Bytes", "Average", "Bytes", "Average", "Name"))
298 print(hl)
299 for i = 1, #varlist do -- iterate according to order assigned
300 local name = varlist[i]
301 local uniq = afteruniq[name]
302 local old_t, old_s = 0, 0
303 for j = 1, #localinfo do -- find corresponding old names and calculate
304 local obj = localinfo[j]
305 if obj.name == name then
306 old_t = old_t + obj.xcount
307 old_s = old_s + obj.xcount * #obj.oldname
308 end
309 end
310 print(fmt(tabf2, uniq.decl, uniq.token, old_s, avg(old_t, old_s),
311 uniq.size, avg(uniq.token, uniq.size), name))
312 end
313 print(hl)
314 print(fmt(tabf2, decl_lo, token_lo, size_li, avg(token_li, size_li),
315 size_lo, avg(token_lo, size_lo), "TOTAL"))
316 print(hl.."\n")
317 end--if opt_details
318 --------------------------------------------------------------------
319 -- display output
320 --------------------------------------------------------------------
321 local tabf1, tabf2 = "%-16s%8s%8s%8s%8s%10s", "%-16s%8d%8d%8d%8d%10.2f"
322 local hl = string.rep("-", 58)
323 print("*** local variable optimization summary ***\n"..hl)
324 print(fmt(tabf1, "Variable", "Unique", "Decl.", "Token", "Size", "Average"))
325 print(fmt(tabf1, "Types", "Names", "Count", "Count", "Bytes", "Bytes"))
326 print(hl)
327 print(fmt(tabf2, "Global", uniq_g, decl_g, token_g, size_g, avg(token_g, size_g)))
328 print(hl)
329 print(fmt(tabf2, "Local (in)", uniq_li, decl_li, token_li, size_li, avg(token_li, size_li)))
330 print(fmt(tabf2, "TOTAL (in)", uniq_ti, decl_ti, token_ti, size_ti, avg(token_ti, size_ti)))
331 print(hl)
332 print(fmt(tabf2, "Local (out)", uniq_lo, decl_lo, token_lo, size_lo, avg(token_lo, size_lo)))
333 print(fmt(tabf2, "TOTAL (out)", uniq_to, decl_to, token_to, size_to, avg(token_to, size_to)))
334 print(hl.."\n")
335 end
336
337 ----------------------------------------------------------------------
338 -- main entry point
339 -- * does only local variable optimization for now
340 ----------------------------------------------------------------------
341
342 function optimize(option, _toklist, _seminfolist, _globalinfo, _localinfo)
343 -- set tables
344 toklist, seminfolist, globalinfo, localinfo
345 = _toklist, _seminfolist, _globalinfo, _localinfo
346 var_new = 0 -- reset variable name allocator
347 varlist = {}
348 ------------------------------------------------------------------
349 -- preprocess global/local tables, handle entropy reduction
350 ------------------------------------------------------------------
351 globaluniq = preprocess(globalinfo)
352 localuniq = preprocess(localinfo)
353 if option["opt-entropy"] then -- for entropy improvement
354 recalc_for_entropy(option)
355 end
356 ------------------------------------------------------------------
357 -- build initial declared object table, then sort according to
358 -- token count, this might help assign more tokens to more common
359 -- variable names such as 'e' thus possibly reducing entropy
360 -- * an object knows its localinfo index via its 'id' field
361 -- * special handling for "self" special local (parameter) here
362 ------------------------------------------------------------------
363 local object = {}
364 for i = 1, #localinfo do
365 object[i] = localinfo[i]
366 end
367 table.sort(object, -- sort largest first
368 function(v1, v2)
369 return v1.xcount > v2.xcount
370 end
371 )
372 ------------------------------------------------------------------
373 -- the special "self" function parameters must be preserved
374 -- * the allocator below will never use "self", so it is safe to
375 -- keep those implicit declarations as-is
376 ------------------------------------------------------------------
377 local temp, j, gotself = {}, 1, false
378 for i = 1, #object do
379 local obj = object[i]
380 if not obj.isself then
381 temp[j] = obj
382 j = j + 1
383 else
384 gotself = true
385 end
386 end
387 object = temp
388 ------------------------------------------------------------------
389 -- a simple first-come first-served heuristic name allocator,
390 -- note that this is in no way optimal...
391 -- * each object is a local variable declaration plus existence
392 -- * the aim is to assign short names to as many tokens as possible,
393 -- so the following tries to maximize name reuse
394 -- * note that we preserve sort order
395 ------------------------------------------------------------------
396 local nobject = #object
397 while nobject > 0 do
398 local varname, gcollide
399 repeat
400 varname, gcollide = new_var_name() -- collect a variable name
401 until not SKIP_NAME[varname] -- skip all special names
402 varlist[#varlist + 1] = varname -- keep a list
403 local oleft = nobject
404 ------------------------------------------------------------------
405 -- if variable name collides with an existing global, the name
406 -- cannot be used by a local when the name is accessed as a global
407 -- during which the local is alive (between 'act' to 'rem'), so
408 -- we drop objects that collides with the corresponding global
409 ------------------------------------------------------------------
410 if gcollide then
411 -- find the xref table of the global
412 local gref = globalinfo[globaluniq[varname].id].xref
413 local ngref = #gref
414 -- enumerate for all current objects; all are valid at this point
415 for i = 1, nobject do
416 local obj = object[i]
417 local act, rem = obj.act, obj.rem -- 'live' range of local
418 -- if rem < 0, it is a -id to a local that had the same name
419 -- so follow rem to extend it; does this make sense?
420 while rem < 0 do
421 rem = localinfo[-rem].rem
422 end
423 local drop
424 for j = 1, ngref do
425 local p = gref[j]
426 if p >= act and p <= rem then drop = true end -- in range?
427 end
428 if drop then
429 obj.skip = true
430 oleft = oleft - 1
431 end
432 end--for
433 end--if gcollide
434 ------------------------------------------------------------------
435 -- now the first unassigned local (since it's sorted) will be the
436 -- one with the most tokens to rename, so we set this one and then
437 -- eliminate all others that collides, then any locals that left
438 -- can then reuse the same variable name; this is repeated until
439 -- all local declaration that can use this name is assigned
440 -- * the criteria for local-local reuse/collision is:
441 -- A is the local with a name already assigned
442 -- B is the unassigned local under consideration
443 -- => anytime A is accessed, it cannot be when B is 'live'
444 -- => to speed up things, we have first/last accesses noted
445 ------------------------------------------------------------------
446 while oleft > 0 do
447 local i = 1
448 while object[i].skip do -- scan for first object
449 i = i + 1
450 end
451 ------------------------------------------------------------------
452 -- first object is free for assignment of the variable name
453 -- [first,last] gives the access range for collision checking
454 ------------------------------------------------------------------
455 oleft = oleft - 1
456 local obja = object[i]
457 i = i + 1
458 obja.newname = varname
459 obja.skip = true
460 obja.done = true
461 local first, last = obja.first, obja.last
462 local xref = obja.xref
463 ------------------------------------------------------------------
464 -- then, scan all the rest and drop those colliding
465 -- if A was never accessed then it'll never collide with anything
466 -- otherwise trivial skip if:
467 -- * B was activated after A's last access (last < act)
468 -- * B was removed before A's first access (first > rem)
469 -- if not, see detailed skip below...
470 ------------------------------------------------------------------
471 if first and oleft > 0 then -- must have at least 1 access
472 local scanleft = oleft
473 while scanleft > 0 do
474 while object[i].skip do -- next valid object
475 i = i + 1
476 end
477 scanleft = scanleft - 1
478 local objb = object[i]
479 i = i + 1
480 local act, rem = objb.act, objb.rem -- live range of B
481 -- if rem < 0, extend range of rem thru' following local
482 while rem < 0 do
483 rem = localinfo[-rem].rem
484 end
485 --------------------------------------------------------
486 if not(last < act or first > rem) then -- possible collision
487 --------------------------------------------------------
488 -- B is activated later than A or at the same statement,
489 -- this means for no collision, A cannot be accessed when B
490 -- is alive, since B overrides A (or is a peer)
491 --------------------------------------------------------
492 if act >= obja.act then
493 for j = 1, obja.xcount do -- ... then check every access
494 local p = xref[j]
495 if p >= act and p <= rem then -- A accessed when B live!
496 oleft = oleft - 1
497 objb.skip = true
498 break
499 end
500 end--for
501 --------------------------------------------------------
502 -- A is activated later than B, this means for no collision,
503 -- A's access is okay since it overrides B, but B's last
504 -- access need to be earlier than A's activation time
505 --------------------------------------------------------
506 else
507 if objb.last and objb.last >= obja.act then
508 oleft = oleft - 1
509 objb.skip = true
510 end
511 end
512 end
513 --------------------------------------------------------
514 if oleft == 0 then break end
515 end
516 end--if first
517 ------------------------------------------------------------------
518 end--while
519 ------------------------------------------------------------------
520 -- after assigning all possible locals to one variable name, the
521 -- unassigned locals/objects have the skip field reset and the table
522 -- is compacted, to hopefully reduce iteration time
523 ------------------------------------------------------------------
524 local temp, j = {}, 1
525 for i = 1, nobject do
526 local obj = object[i]
527 if not obj.done then
528 obj.skip = false
529 temp[j] = obj
530 j = j + 1
531 end
532 end
533 object = temp -- new compacted object table
534 nobject = #object -- objects left to process
535 ------------------------------------------------------------------
536 end--while
537 ------------------------------------------------------------------
538 -- after assigning all locals with new variable names, we can
539 -- patch in the new names, and reprocess to get 'after' stats
540 ------------------------------------------------------------------
541 for i = 1, #localinfo do -- enumerate all locals
542 local obj = localinfo[i]
543 local xref = obj.xref
544 if obj.newname then -- if got new name, patch it in
545 for j = 1, obj.xcount do
546 local p = xref[j] -- xrefs indexes the token list
547 seminfolist[p] = obj.newname
548 end
549 obj.name, obj.oldname -- adjust names
550 = obj.newname, obj.name
551 else
552 obj.oldname = obj.name -- for cases like 'self'
553 end
554 end
555 ------------------------------------------------------------------
556 -- deal with statistics output
557 ------------------------------------------------------------------
558 if gotself then -- add 'self' to end of list
559 varlist[#varlist + 1] = "self"
560 end
561 local afteruniq = preprocess(localinfo)
562 stats_summary(globaluniq, localuniq, afteruniq, option)
563 ------------------------------------------------------------------
564 end