Skip to content

Commit 5363c47

Browse files
committed
Fix import utils
1 parent d9bbcdd commit 5363c47

File tree

2 files changed

+82
-68
lines changed

2 files changed

+82
-68
lines changed

util/analyze/imports/import_cpu2006.py

+2-3
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
#!/usr/bin/env python3
22

33
import os
4-
import re
54

65
from . import import_utils
76

@@ -13,8 +12,8 @@ def parse(file):
1312
with open(file, 'r') as f:
1413
return import_utils.parse_multi_bench_file(
1514
f.read(),
16-
benchstart=re.compile(r'Building (?P<name>\S*)'),
17-
filename=re.compile(r'/[fc]lang\b.*\s(\S+\.\S+)\n'))
15+
benchstart=r'Building (?P<name>\S*)',
16+
filename=r'/[fc]lang\b.*\s(\S+\.\S+)\n')
1817

1918

2019
if __name__ == '__main__':

util/analyze/imports/import_utils.py

+80-65
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,15 @@
1-
import pickle
2-
import json
31
import itertools
2+
import json
3+
import pickle
44
import re
55
import sys
6-
from collections import namedtuple
6+
from dataclasses import dataclass
7+
from typing import List, Match, Optional, Pattern, Union
78

8-
from .._types import Logs, Benchmark, Block
9+
from .._types import Benchmark, Block, Logs
910

10-
_RE_REGION_INFO = re.compile(r'EVENT:.*ProcessDag.*"name": "(?P<name>[^"]*)"')
11+
_REGION_DELIMITER = 'INFO: ********** Opt Scheduling **********'
12+
_RE_REGION_DELIMITER = re.compile(re.escape(_REGION_DELIMITER))
1113

1214

1315
def import_main(parsefn, *, description):
@@ -24,18 +26,39 @@ def import_main(parsefn, *, description):
2426
pickle.dump(result, f)
2527

2628

def parse_multi_bench_file(logtext: str, *, benchstart: Union[Pattern, str], filename: Optional[Union[Pattern, str]] = None):
    """Split a log containing several benchmarks into a Logs object.

    benchstart: RE (or pattern string) marking the start of each benchmark;
        its named groups become properties of that benchmark.
    filename: optional RE (or pattern string) locating per-file compiler
        output inside each benchmark's slice.
    """
    benchstart = re.compile(benchstart)
    filenamere = re.compile(filename) if filename is not None else None

    starts = list(benchstart.finditer(logtext))
    # Each benchmark ends where the next one begins; the last one ends at EOF.
    ends = [*starts[1:], _DummyEnd(len(logtext))]

    benchmarks = []
    for index, (benchm, nextm) in enumerate(zip(starts, ends)):
        # The benchstart RE can specify any extra properties.
        info = benchm.groupdict()
        # The first benchmark starts from the beginning of the file so that
        # we don't lose any information appearing before its marker.
        span_start = 0 if index == 0 else benchm.start()
        benchmarks.append(_parse_benchmark(info, logtext,
                                           span_start, nextm.start(),
                                           filenamere=filenamere))

    return Logs(benchmarks)
3657

3758

38-
def parse_single_bench_file(logtext, *, benchname, filename=None):
59+
def parse_single_bench_file(logtext, *, benchname, filename: Optional[Union[Pattern, str]] = None):
60+
if filename is not None:
61+
filename = re.compile(filename)
3962
return Logs([
4063
_parse_benchmark(
4164
{'name': benchname},
@@ -45,21 +68,10 @@ def parse_single_bench_file(logtext, *, benchname, filename=None):
4568
])
4669

4770

48-
_FileInfo = namedtuple('_FileInfo', ('filename', 'from_pos'))
49-
50-
51-
def _each_cons(iterable, n):
52-
'''
53-
Iterates over each consecutive n items of the iterable.
54-
55-
_each_cons((1, 2, 3, 4), 2) # (1, 2), (2, 3), (3, 4)
56-
'''
57-
iters = [None] * n
58-
iters[0] = iter(iterable)
59-
for i in range(1, n):
60-
iters[i - 1], iters[i] = itertools.tee(iters[i - 1])
61-
next(iters[i], None)
62-
return zip(*iters)
@dataclass
class _FileInfo:
    """A source filename paired with the log position from which it applies."""
    # Filename captured by the filename RE (its group 1); None when unknown.
    filename: Optional[str]
    # Offset into the log text where this file's output begins
    # (the end of the RE match that identified it).
    from_pos: int
6375

6476

6577
class _DummyEnd:
@@ -73,65 +85,68 @@ def end(self):
7385
return self._end
7486

7587

def _filename_info(filenamere: Optional[Pattern], logtext: str, start: int, end: int) -> List[_FileInfo]:
    """Collect a _FileInfo for every filename-RE match in logtext[start:end].

    When no RE is supplied there is nothing to match, so the result is empty.
    """
    if filenamere is None:
        return []

    return [
        _FileInfo(filename=m.group(1), from_pos=m.end())
        for m in filenamere.finditer(logtext, start, end)
    ]
9099

def _parse_benchmark(info: dict, logtext: str, start: int, end: int, *, filenamere: Optional[Pattern]):
    """Parse one benchmark's slice of the log, logtext[start:end], into a Benchmark.

    info: properties of the benchmark; must contain a 'name' entry.
    filenamere: RE locating per-file compiler output, or None when filenames
        are unknown.
    Returns a Benchmark whose blocks are the scheduling regions found in the
    slice, each tagged with the file it came from (when determinable).
    """
    BENCHNAME = info['name']

    files: List[_FileInfo] = _filename_info(filenamere, logtext, start, end)
    if not files:
        # We have an unknown file starting from the very beginning
        files = [_FileInfo(filename=None, from_pos=start)]

    # Allow us to peek ahead by giving a dummy "file" at the end which will never match a block
    files.append(_FileInfo(filename=None, from_pos=end))
    assert len(files) >= 2
    file_pos = 0

    block_matches1, block_matches2 = itertools.tee(_RE_REGION_DELIMITER.finditer(logtext, start, end))
    # Drop the first match so block_matches2 yields each region's successor.
    # The None default avoids an uncaught StopIteration when the slice
    # contains no region delimiters at all.
    next(block_matches2, None)
    block_matches2 = itertools.chain(block_matches2, (_DummyEnd(end),))

    blocks = []

    is_first = True
    for regionm, nextm in zip(block_matches1, block_matches2):
        region_start = regionm.end()
        # Advance past every file whose output ends before this region starts;
        # a single-step `if` here would mis-attribute regions whenever more
        # than one file sits between two consecutive regions.
        while region_start > files[file_pos + 1].from_pos:
            file_pos += 1

        assert region_start > files[file_pos].from_pos

        regioninfo = {
            'file': files[file_pos].filename,
            'benchmark': BENCHNAME,
        }
        # The first region's block absorbs the slice prefix so no text is lost.
        blk_start = start if is_first else regionm.start()
        blk_end = nextm.start()
        blocks.append(_parse_block(regioninfo, logtext,
                                   blk_start, blk_end))
        is_first = False

    return Benchmark(info, blocks)
130143

131144

132145
def _parse_block(info, logtext: str, start, end):
    """Build a Block from the region occupying logtext[start:end].

    The block takes its name from the region's ProcessDag event and keeps the
    raw log slice alongside the parsed event stream.
    """
    events = _parse_events(logtext, start, end)
    assert 'ProcessDag' in events
    info['name'] = events['ProcessDag'][0]['name']

    return Block(info, logtext[start:end], events)
137152

0 commit comments

Comments
 (0)