-
Notifications
You must be signed in to change notification settings - Fork 3
/
dvc.lock
121 lines (121 loc) · 3.6 KB
/
dvc.lock
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
schema: '2.0'
stages:
generate_dataset:
cmd: python -m zoning.data_processing.generate_dataset
deps:
- path: data/names_all_towns.json
md5: 0013040cfeb0f4f23a35a94cae9f2950
size: 2665
- path: data/textract_dataset
md5: 11062398489b76650f2d8335fe86775a.dir
size: 8441946344
nfiles: 181
- path: zoning/data_processing/generate_dataset.py
md5: 51f7f2eef57fa468b3d03fd2d8faa385
size: 4401
params:
params.yaml:
generate_dataset.seed: 42
generate_dataset.test_split_frac: 0.3
publish_datasets: true
outs:
- path: data/hf_dataset
hash: md5
md5: 302e323289b3182818324f7ac09702e9.dir
size: 5363614993
nfiles: 13
- path: data/parquet_dataset
hash: md5
md5: ce3b5231d58d404fcf15d265e11ccab8.dir
size: 2219366007
nfiles: 176
upload_zoning_docs:
cmd: bash zoning/data_processing/upload_zoning_docs.sh
deps:
- path: data/orig-documents
hash: md5
md5: 2311e7a6bb96f9a48b11de97f333475a.dir
size: 1004547167
nfiles: 181
- path: zoning/data_processing/upload_zoning_docs.sh
md5: 4bc9fb2d649525a0c3928fe4f7bc2307
size: 316
outs:
- path: data/orig_documents_s3_manifest.json
md5: 897ea14dfed07ed31455e60dc08c51c4
size: 9670
extract_text:
cmd: python -m zoning.data_processing.extract_text
deps:
- path: data/orig_documents_s3_manifest.json
md5: 897ea14dfed07ed31455e60dc08c51c4
size: 9670
- path: zoning/data_processing/extract_text.py
md5: 98404ae0ff0f6d2466c5e067543c2e41
size: 2788
params:
params.yaml:
extract_text.orig_document_s3_bucket: cornell-mfd64
outs:
- path: data/textract_dataset
md5: 11062398489b76650f2d8335fe86775a.dir
size: 8441946344
nfiles: 181
generate_text_dataset:
cmd: python -m zoning.data_processing.generate_text_dataset
deps:
- path: data/hf_dataset
hash: md5
md5: 302e323289b3182818324f7ac09702e9.dir
size: 5363614993
nfiles: 13
- path: zoning/data_processing/generate_text_dataset.py
md5: 45a291510a51b8886397868052138501
size: 3896
params:
params.yaml:
publish_datasets: true
outs:
- path: data/hf_text_dataset
hash: md5
md5: e832d3471ef79f057d9845b1c45015ee.dir
size: 472917787
nfiles: 7
index_towns:
cmd: python -m zoning.data_processing.index_towns
deps:
- path: data/hf_text_dataset
hash: md5
md5: e832d3471ef79f057d9845b1c45015ee.dir
size: 472917787
nfiles: 7
- path: zoning/data_processing/index_towns.py
hash: md5
md5: 0601f229376ae98b338a4b45cbd1c021
size: 1700
evaluate:
cmd: python -m zoning.data_processing.eval --num-eval-rows 10 --terms min_lot_size
--terms min_unit_size --terms max_height --terms max_lot_coverage --terms max_lot_coverage_pavement
--terms min_parking_spaces --search-method elasticsearch --extraction-method
map --k 12
deps:
- path: templates/extraction_chat_completion.pmpt.tpl
hash: md5
md5: c4e2265212d89c46f25fb476e1268be1
size: 6994
- path: templates/extraction_completion.pmpt.tpl
md5: cdd9b0b79ac4ed02da5897315ccc15df
size: 91
- path: zoning/data_processing/eval.py
hash: md5
md5: d5bf368ef9fe923c6b265c4b866ae1e2
size: 21553
outs:
- path: data/results/eval.parquet
hash: md5
md5: 2f7e6b01503a046bd080bc3dfbd03d59
size: 23110
- path: data/results/eval.yaml
hash: md5
md5: c5e8e9deddd300ddaac591c4a1d6ba83
size: 655