Skip to content

Commit 5363c47

Browse files
committed
Fix import utils
1 parent d9bbcdd commit 5363c47

File tree

2 files changed

+82
-68
lines changed

2 files changed

+82
-68
lines changed

util/analyze/imports/import_cpu2006.py

+2-3
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
#!/usr/bin/env python3
22

33
import os
4-
import re
54

65
from . import import_utils
76

@@ -13,8 +12,8 @@ def parse(file):
1312
with open(file, 'r') as f:
1413
return import_utils.parse_multi_bench_file(
1514
f.read(),
16-
benchstart=re.compile(r'Building (?P<name>\S*)'),
17-
filename=re.compile(r'/[fc]lang\b.*\s(\S+\.\S+)\n'))
15+
benchstart=r'Building (?P<name>\S*)',
16+
filename=r'/[fc]lang\b.*\s(\S+\.\S+)\n')
1817

1918

2019
if __name__ == '__main__':

util/analyze/imports/import_utils.py

+80-65
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,15 @@
1-
import pickle
2-
import json
31
import itertools
2+
import json
3+
import pickle
44
import re
55
import sys
6-
from collections import namedtuple
6+
from dataclasses import dataclass
7+
from typing import List, Match, Optional, Pattern, Union
78

8-
from .._types import Logs, Benchmark, Block
9+
from .._types import Benchmark, Block, Logs
910

10-
_RE_REGION_INFO = re.compile(r'EVENT:.*ProcessDag.*"name": "(?P<name>[^"]*)"')
11+
_REGION_DELIMITER = 'INFO: ********** Opt Scheduling **********'
12+
_RE_REGION_DELIMITER = re.compile(re.escape(_REGION_DELIMITER))
1113

1214

1315
def import_main(parsefn, *, description):
@@ -24,18 +26,39 @@ def import_main(parsefn, *, description):
2426
pickle.dump(result, f)
2527

2628

def parse_multi_bench_file(logtext: str, *, benchstart: Union[Pattern, str], filename: Optional[Union[Pattern, str]] = None):
    """Split a log containing several benchmarks into a Logs object.

    benchstart: RE (or pattern string) marking the start of each benchmark;
        its named groups become properties of that benchmark.
    filename: optional RE (or pattern string) locating per-file compiler
        output inside each benchmark's slice.
    """
    benchstart = re.compile(benchstart)
    filenamere = re.compile(filename) if filename is not None else None

    starts = list(benchstart.finditer(logtext))
    # Each benchmark ends where the next one begins; the last one ends at EOF.
    ends = [*starts[1:], _DummyEnd(len(logtext))]

    benchmarks = []
    for index, (benchm, nextm) in enumerate(zip(starts, ends)):
        # The benchstart RE can specify any extra properties.
        info = benchm.groupdict()
        # The first benchmark starts from the beginning of the file so that
        # we don't lose any information appearing before its marker.
        span_start = 0 if index == 0 else benchm.start()
        benchmarks.append(_parse_benchmark(info, logtext,
                                           span_start, nextm.start(),
                                           filenamere=filenamere))

    return Logs(benchmarks)
3657

3758

38-
def parse_single_bench_file(logtext, *, benchname, filename=None):
59+
def parse_single_bench_file(logtext, *, benchname, filename: Optional[Union[Pattern, str]] = None):
60+
if filename is not None:
61+
filename = re.compile(filename)
3962
return Logs([
4063
_parse_benchmark(
4164
{'name': benchname},
@@ -45,21 +68,10 @@ def parse_single_bench_file(logtext, *, benchname, filename=None):
4568
])
4669

4770

48-
_FileInfo = namedtuple('_FileInfo', ('filename', 'from_pos'))
49-
50-
51-
def _each_cons(iterable, n):
52-
'''
53-
Iterates over each consecutive n items of the iterable.
54-
55-
_each_cons((1, 2, 3, 4), 2) # (1, 2), (2, 3), (3, 4)
56-
'''
57-
iters = [None] * n
58-
iters[0] = iter(iterable)
59-
for i in range(1, n):
60-
iters[i - 1], iters[i] = itertools.tee(iters[i - 1])
61-
next(iters[i], None)
62-
return zip(*iters)
@dataclass
class _FileInfo:
    """A source filename paired with the log position from which it applies."""
    # Filename captured by the filename RE (its group 1); None when unknown.
    filename: Optional[str]
    # Offset into the log text where this file's output begins
    # (the end of the RE match that identified it).
    from_pos: int
6375

6476

6577
class _DummyEnd:
@@ -73,65 +85,68 @@ def end(self):
7385
return self._end
7486

7587

def _filename_info(filenamere: Optional[Pattern], logtext: str, start: int, end: int) -> List[_FileInfo]:
    """Collect a _FileInfo for every filename-RE match in logtext[start:end].

    When no RE is supplied there is nothing to match, so the result is empty.
    """
    if filenamere is None:
        return []

    return [
        _FileInfo(filename=m.group(1), from_pos=m.end())
        for m in filenamere.finditer(logtext, start, end)
    ]
9099

def _parse_benchmark(info: dict, logtext: str, start: int, end: int, *, filenamere: Optional[Pattern]):
    """Parse one benchmark's slice of the log, logtext[start:end], into a Benchmark.

    info: properties of the benchmark; must contain a 'name' entry.
    filenamere: RE locating per-file compiler output, or None when filenames
        are unknown.
    Returns a Benchmark whose blocks are the scheduling regions found in the
    slice, each tagged with the file it came from (when determinable).
    """
    BENCHNAME = info['name']

    files: List[_FileInfo] = _filename_info(filenamere, logtext, start, end)
    if not files:
        # We have an unknown file starting from the very beginning
        files = [_FileInfo(filename=None, from_pos=start)]

    # Allow us to peek ahead by giving a dummy "file" at the end which will never match a block
    files.append(_FileInfo(filename=None, from_pos=end))
    assert len(files) >= 2
    file_pos = 0

    block_matches1, block_matches2 = itertools.tee(_RE_REGION_DELIMITER.finditer(logtext, start, end))
    # Drop the first match so block_matches2 yields each region's successor.
    # The None default avoids an uncaught StopIteration when the slice
    # contains no region delimiters at all.
    next(block_matches2, None)
    block_matches2 = itertools.chain(block_matches2, (_DummyEnd(end),))

    blocks = []

    is_first = True
    for regionm, nextm in zip(block_matches1, block_matches2):
        region_start = regionm.end()
        # Advance past every file whose output ends before this region starts;
        # a single-step `if` here would mis-attribute regions whenever more
        # than one file sits between two consecutive regions.
        while region_start > files[file_pos + 1].from_pos:
            file_pos += 1

        assert region_start > files[file_pos].from_pos

        regioninfo = {
            'file': files[file_pos].filename,
            'benchmark': BENCHNAME,
        }
        # The first region's block absorbs the slice prefix so no text is lost.
        blk_start = start if is_first else regionm.start()
        blk_end = nextm.start()
        blocks.append(_parse_block(regioninfo, logtext,
                                   blk_start, blk_end))
        is_first = False

    return Benchmark(info, blocks)
130143

131144

132145
def _parse_block(info, logtext: str, start, end):
    """Build a Block from the region occupying logtext[start:end].

    The block takes its name from the region's ProcessDag event and keeps the
    raw log slice alongside the parsed event stream.
    """
    events = _parse_events(logtext, start, end)
    assert 'ProcessDag' in events
    info['name'] = events['ProcessDag'][0]['name']

    return Block(info, logtext[start:end], events)
137152

0 commit comments

Comments
 (0)