Skip to content

Commit 3fd4435

Browse files
committed
reformat python files for uniform styling
1 parent 1ea6932 commit 3fd4435

26 files changed

+2479
-1478
lines changed

Diff for: CSharpExtractor/extract.py

+45-15
Original file line number | Diff line number | Diff line change
@@ -12,23 +12,38 @@
1212
from subprocess import Popen, PIPE, STDOUT, call
1313

1414

15-
1615
def get_immediate_subdirectories(a_dir):
17-
return [(os.path.join(a_dir, name)) for name in os.listdir(a_dir)
18-
if os.path.isdir(os.path.join(a_dir, name))]
16+
return [
17+
(os.path.join(a_dir, name))
18+
for name in os.listdir(a_dir)
19+
if os.path.isdir(os.path.join(a_dir, name))
20+
]
1921

2022

2123
TMP_DIR = ""
2224

25+
2326
def ParallelExtractDir(args, dir):
2427
ExtractFeaturesForDir(args, dir, "")
2528

2629

2730
def ExtractFeaturesForDir(args, dir, prefix):
28-
command = ['dotnet', 'run', '--project', args.csproj,
29-
'--max_length', str(args.max_path_length), '--max_width', str(args.max_path_width),
30-
'--path', dir, '--threads', str(args.num_threads), '--ofile_name', str(args.ofile_name)]
31-
31+
command = [
32+
"dotnet",
33+
"run",
34+
"--project",
35+
args.csproj,
36+
"--max_length",
37+
str(args.max_path_length),
38+
"--max_width",
39+
str(args.max_path_width),
40+
"--path",
41+
dir,
42+
"--threads",
43+
str(args.num_threads),
44+
"--ofile_name",
45+
str(args.ofile_name),
46+
]
3247

3348
# print command
3449
# os.system(command)
@@ -46,15 +61,16 @@ def ExtractFeaturesForDir(args, dir, prefix):
4661
if len(stderr) > 0:
4762
print(sys.stderr, stderr)
4863
else:
49-
print(sys.stderr, 'dir: ' + str(dir) + ' was not completed in time')
64+
print(sys.stderr, "dir: " + str(dir) + " was not completed in time")
5065
failed = True
5166
subdirs = get_immediate_subdirectories(dir)
5267
for subdir in subdirs:
53-
ExtractFeaturesForDir(args, subdir, prefix + dir.split('/')[-1] + '_')
68+
ExtractFeaturesForDir(args, subdir, prefix + dir.split("/")[-1] + "_")
5469
if failed:
5570
if os.path.exists(str(args.ofile_name)):
5671
os.remove(str(args.ofile_name))
5772

73+
5874
def ExtractFeaturesForDirsList(args, dirs):
5975
global TMP_DIR
6076
TMP_DIR = "./tmp/feature_extractor%d/" % (os.getpid())
@@ -64,7 +80,7 @@ def ExtractFeaturesForDirsList(args, dirs):
6480
try:
6581
p = multiprocessing.Pool(4)
6682
p.starmap(ParallelExtractDir, zip(itertools.repeat(args), dirs))
67-
#for dir in dirs:
83+
# for dir in dirs:
6884
# ExtractFeaturesForDir(args, dir, '')
6985
output_files = os.listdir(TMP_DIR)
7086
for f in output_files:
@@ -73,12 +89,26 @@ def ExtractFeaturesForDirsList(args, dirs):
7389
shutil.rmtree(TMP_DIR, ignore_errors=True)
7490

7591

76-
if __name__ == '__main__':
92+
if __name__ == "__main__":
7793

7894
parser = ArgumentParser()
79-
parser.add_argument("-maxlen", "--max_path_length", dest="max_path_length", required=False, default=8)
80-
parser.add_argument("-maxwidth", "--max_path_width", dest="max_path_width", required=False, default=2)
81-
parser.add_argument("-threads", "--num_threads", dest="num_threads", required=False, default=64)
95+
parser.add_argument(
96+
"-maxlen",
97+
"--max_path_length",
98+
dest="max_path_length",
99+
required=False,
100+
default=8,
101+
)
102+
parser.add_argument(
103+
"-maxwidth",
104+
"--max_path_width",
105+
dest="max_path_width",
106+
required=False,
107+
default=2,
108+
)
109+
parser.add_argument(
110+
"-threads", "--num_threads", dest="num_threads", required=False, default=64
111+
)
82112
parser.add_argument("--csproj", dest="csproj", required=True)
83113
parser.add_argument("-dir", "--dir", dest="dir", required=False)
84114
parser.add_argument("-ofile_name", "--ofile_name", dest="ofile_name", required=True)
@@ -88,5 +118,5 @@ def ExtractFeaturesForDirsList(args, dirs):
88118
subdirs = get_immediate_subdirectories(args.dir)
89119
to_extract = subdirs
90120
if len(subdirs) == 0:
91-
to_extract = [args.dir.rstrip('/')]
121+
to_extract = [args.dir.rstrip("/")]
92122
ExtractFeaturesForDirsList(args, to_extract)

Diff for: Python150kExtractor/extract.py

+42-41
Original file line number | Diff line number | Diff line change
@@ -10,23 +10,23 @@
1010
from pathlib import Path
1111
from sklearn import model_selection as sklearn_model_selection
1212

13-
METHOD_NAME, NUM = 'METHODNAME', 'NUM'
13+
METHOD_NAME, NUM = "METHODNAME", "NUM"
1414

1515
parser = argparse.ArgumentParser()
16-
parser.add_argument('--data_dir', required=True, type=str)
17-
parser.add_argument('--valid_p', type=float, default=0.2)
18-
parser.add_argument('--max_path_length', type=int, default=8)
19-
parser.add_argument('--max_path_width', type=int, default=2)
20-
parser.add_argument('--use_method_name', type=bool, default=True)
21-
parser.add_argument('--use_nums', type=bool, default=True)
22-
parser.add_argument('--output_dir', required=True, type=str)
23-
parser.add_argument('--n_jobs', type=int, default=multiprocessing.cpu_count())
24-
parser.add_argument('--seed', type=int, default=239)
16+
parser.add_argument("--data_dir", required=True, type=str)
17+
parser.add_argument("--valid_p", type=float, default=0.2)
18+
parser.add_argument("--max_path_length", type=int, default=8)
19+
parser.add_argument("--max_path_width", type=int, default=2)
20+
parser.add_argument("--use_method_name", type=bool, default=True)
21+
parser.add_argument("--use_nums", type=bool, default=True)
22+
parser.add_argument("--output_dir", required=True, type=str)
23+
parser.add_argument("--n_jobs", type=int, default=multiprocessing.cpu_count())
24+
parser.add_argument("--seed", type=int, default=239)
2525

2626

2727
def __collect_asts(json_file):
2828
asts = []
29-
with open(json_file, 'r', encoding='utf-8') as f:
29+
with open(json_file, "r", encoding="utf-8") as f:
3030
for line in f:
3131
ast = json.loads(line.strip())
3232
asts.append(ast)
@@ -42,22 +42,22 @@ def dfs(v):
4242

4343
v_node = ast[v]
4444

45-
if 'value' in v_node:
45+
if "value" in v_node:
4646
if v == node_index: # Top-level func def node.
4747
if args.use_method_name:
4848
paths.append((stack.copy(), METHOD_NAME))
4949
else:
50-
v_type = v_node['type']
50+
v_type = v_node["type"]
5151

52-
if v_type.startswith('Name'):
53-
paths.append((stack.copy(), v_node['value']))
54-
elif args.use_nums and v_type == 'Num':
52+
if v_type.startswith("Name"):
53+
paths.append((stack.copy(), v_node["value"]))
54+
elif args.use_nums and v_type == "Num":
5555
paths.append((stack.copy(), NUM))
5656
else:
5757
pass
5858

59-
if 'children' in v_node:
60-
for child in v_node['children']:
59+
if "children" in v_node:
60+
for child in v_node["children"]:
6161
dfs(child)
6262

6363
stack.pop()
@@ -84,12 +84,13 @@ def __raw_tree_paths(ast, node_index, args):
8484

8585
tree_paths = []
8686
for (v_path, v_value), (u_path, u_value) in itertools.combinations(
87-
iterable=tnodes,
88-
r=2,
87+
iterable=tnodes,
88+
r=2,
8989
):
9090
prefix, lca, suffix = __merge_terminals2_paths(v_path, u_path)
91-
if (len(prefix) + 1 + len(suffix) <= args.max_path_length) \
92-
and (abs(len(prefix) - len(suffix)) <= args.max_path_width):
91+
if (len(prefix) + 1 + len(suffix) <= args.max_path_length) and (
92+
abs(len(prefix) - len(suffix)) <= args.max_path_width
93+
):
9394
path = prefix + [lca] + suffix
9495
tree_path = v_value, path, u_value
9596
tree_paths.append(tree_path)
@@ -103,49 +104,49 @@ def __delim_name(name):
103104

104105
def camel_case_split(identifier):
105106
matches = re.finditer(
106-
'.+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)',
107+
".+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)",
107108
identifier,
108109
)
109110
return [m.group(0) for m in matches]
110111

111112
blocks = []
112-
for underscore_block in name.split('_'):
113+
for underscore_block in name.split("_"):
113114
blocks.extend(camel_case_split(underscore_block))
114115

115-
return '|'.join(block.lower() for block in blocks)
116+
return "|".join(block.lower() for block in blocks)
116117

117118

118119
def __collect_sample(ast, fd_index, args):
119120
root = ast[fd_index]
120-
if root['type'] != 'FunctionDef':
121-
raise ValueError('Wrong node type.')
121+
if root["type"] != "FunctionDef":
122+
raise ValueError("Wrong node type.")
122123

123-
target = root['value']
124+
target = root["value"]
124125

125126
tree_paths = __raw_tree_paths(ast, fd_index, args)
126127
contexts = []
127128
for tree_path in tree_paths:
128129
start, connector, finish = tree_path
129130

130131
start, finish = __delim_name(start), __delim_name(finish)
131-
connector = '|'.join(ast[v]['type'] for v in connector)
132+
connector = "|".join(ast[v]["type"] for v in connector)
132133

133-
context = f'{start},{connector},{finish}'
134+
context = f"{start},{connector},{finish}"
134135
contexts.append(context)
135136

136137
if len(contexts) == 0:
137138
return None
138139

139140
target = __delim_name(target)
140-
context = ' '.join(contexts)
141+
context = " ".join(contexts)
141142

142-
return f'{target} {context}'
143+
return f"{target} {context}"
143144

144145

145146
def __collect_samples(ast, args):
146147
samples = []
147148
for node_index, node in enumerate(ast):
148-
if node['type'] == 'FunctionDef':
149+
if node["type"] == "FunctionDef":
149150
sample = __collect_sample(ast, node_index, args)
150151
if sample is not None:
151152
samples.append(sample)
@@ -160,18 +161,18 @@ def __collect_all_and_save(asts, args, output_file):
160161
samples = parallel(func(ast, args) for ast in tqdm.tqdm(asts))
161162
samples = list(itertools.chain.from_iterable(samples))
162163

163-
with open(output_file, 'w') as f:
164+
with open(output_file, "w") as f:
164165
for line_index, line in enumerate(samples):
165-
f.write(line + ('' if line_index == len(samples) - 1 else '\n'))
166+
f.write(line + ("" if line_index == len(samples) - 1 else "\n"))
166167

167168

168169
def main():
169170
args = parser.parse_args()
170171
np.random.seed(args.seed)
171172

172173
data_dir = Path(args.data_dir)
173-
trains = __collect_asts(data_dir / 'python100k_train.json')
174-
evals = __collect_asts(data_dir / 'python50k_eval.json')
174+
trains = __collect_asts(data_dir / "python100k_train.json")
175+
evals = __collect_asts(data_dir / "python50k_eval.json")
175176

176177
train, valid = sklearn_model_selection.train_test_split(
177178
trains,
@@ -182,12 +183,12 @@ def main():
182183
output_dir = Path(args.output_dir)
183184
output_dir.mkdir(exist_ok=True)
184185
for split_name, split in zip(
185-
('train', 'valid', 'test'),
186-
(train, valid, test),
186+
("train", "valid", "test"),
187+
(train, valid, test),
187188
):
188-
output_file = output_dir / f'{split_name}_output_file.txt'
189+
output_file = output_dir / f"{split_name}_output_file.txt"
189190
__collect_all_and_save(split, args, output_file)
190191

191192

192-
if __name__ == '__main__':
193+
if __name__ == "__main__":
193194
main()

Diff for: args.py

+44-14
Original file line number | Diff line number | Diff line change
@@ -5,21 +5,51 @@ def read_args():
55
parser = ArgumentParser()
66

77
group = parser.add_mutually_exclusive_group(required=True)
8-
group.add_argument("-d", "--data", dest="data_path",
9-
help="path to preprocessed dataset")
10-
group.add_argument("-l", "--load_path", dest="load_path",
11-
help="path to load model files", metavar="FILE")
8+
group.add_argument(
9+
"-d", "--data", dest="data_path", help="path to preprocessed dataset"
10+
)
11+
group.add_argument(
12+
"-l",
13+
"--load_path",
14+
dest="load_path",
15+
help="path to load model files",
16+
metavar="FILE",
17+
)
1218

13-
parser.add_argument("-m", "--model_path", dest="model_path",
14-
help="path to save and load checkpoints", metavar="FILE", required=False)
15-
parser.add_argument("-s", "--save_path", dest="save_path",
16-
help="path to save model files", metavar="FILE", required=False)
19+
parser.add_argument(
20+
"-m",
21+
"--model_path",
22+
dest="model_path",
23+
help="path to save and load checkpoints",
24+
metavar="FILE",
25+
required=False,
26+
)
27+
parser.add_argument(
28+
"-s",
29+
"--save_path",
30+
dest="save_path",
31+
help="path to save model files",
32+
metavar="FILE",
33+
required=False,
34+
)
1735

18-
parser.add_argument("-t", "--test", dest="test_path",
19-
help="path to test file", metavar="FILE", required=False)
36+
parser.add_argument(
37+
"-t",
38+
"--test",
39+
dest="test_path",
40+
help="path to test file",
41+
metavar="FILE",
42+
required=False,
43+
)
2044

21-
parser.add_argument('-p', '--predict', dest='predict', type=str, default='java',
22-
help='starts prediction mode, argument is "cpp" or "java" dependin on language model')
23-
parser.add_argument('--debug', action='store_true')
24-
parser.add_argument('--seed', type=int, default=239)
45+
parser.add_argument(
46+
"-p",
47+
"--predict",
48+
dest="predict",
49+
type=str,
50+
default="java",
51+
help='starts prediction mode, argument is "cpp" or "java" dependin on language model',
52+
)
53+
parser.add_argument("--debug", action="store_true")
54+
parser.add_argument("--seed", type=int, default=239)
2555
return parser.parse_args()

Diff for: baseline_tokenization/javalang/__init__.py

-1
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,3 @@
1-
21
from . import parser
32
from . import parse
43
from . import tokenizer

0 commit comments

Comments
 (0)