-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathTaskfile.yaml
167 lines (151 loc) · 4.67 KB
/
Taskfile.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
# https://taskfile.dev
version: '3'

# Shared template variables. Entries may reference earlier ones, so every
# path below is derived from DATA_DIR; overriding DATA_DIR relocates all
# generated artifacts (result files and both SQLite databases) together.
vars:
  DATA_DIR: ./data
  # Sourcegraph search hits, one JSON object per line.
  SRC_RESULTS_JSONL: "{{.DATA_DIR}}/src_github_results.jsonl"
  SRC_RESULTS_CSV: "{{.DATA_DIR}}/src_github_results.csv"
  # Search hits whose repository is absent from the npm repository list.
  NON_NPM_SRC_RESULTS_JSONL: "{{.DATA_DIR}}/non_npm_src_github_results.jsonl"
  # Host name of the nsqd daemon; tasks append the TCP port themselves.
  NSQD_HOST: nsqd
  # Scratch database used by the meta-db:* tasks.
  IN_PROGRESS_DB: "{{.DATA_DIR}}/working.db"
  # Database populated by the db:* tasks.
  FINAL_DB: "{{.DATA_DIR}}/packages.db"

# Environment variables are loaded from this file before tasks run.
dotenv:
  - .env
tasks:
# Install the command-line tools used by this Taskfile via Homebrew.
brew:requirements:
  desc: Install required utilities.
  cmds:
    # `nsq` provides to_nsq and nsq_to_file (github:collect:* tasks) and
    # `sqlite-utils` provides the CLI used by the meta-db:* and db:* tasks;
    # both are invoked by this file, so they are installed here as well.
    - |-
      brew install \
        benthos \
        jq \
        nsq \
        ripgrep \
        sourcegraph/src-cli/src-cli \
        sqlite \
        sqlite-utils \
        xsv
# Runs `src login` so that authentication problems with the Sourcegraph CLI
# surface before a long-running search is started.
src:login:
  desc: Test Sourcegraph CLI authentication.
  cmds:
    - cmd: src login
# Stream a Sourcegraph search for package.json files and write the hits that
# live on github.com to SRC_RESULTS_JSONL, one JSON object per line, with the
# "github.com/" host prefix removed from each `repository` value.
src:query:
  desc: Query Sourcegraph for package.json files
  summary: |
    Query SourceGraph for package.json files.
    SourceGraph query asks for all package.json files excluding files found in directories such
    as node_modules, test, fixture, and examples. The returned results are filtered to contain
    GitHub repositories and reformatting the repository field in the output.
  cmds:
    # pipefail: without it the pipeline's status is that of the last jq, so a
    #   failed `src search` would still leave a "successful" empty results file.
    # The host regex escapes the dot and requires the trailing slash so that
    #   only the real github.com host matches, and the prefix removal is
    #   anchored to the start of the string.
    - |-
      set -o pipefail
      src search -stream -json '{{ .SRC_QUERY }}' \
        | jq -c '
            select(.type == "path")
            | select(.repository | test("^github\\.com/"))
            | .repository |= sub("^github\\.com/"; "")
          ' \
        > {{ .SRC_RESULTS_JSONL }}
  vars:
    # Folded into a single space-separated query string. File filters are
    # regular expressions, hence the escaped dot in `package\.json`.
    SRC_QUERY: >-
      file:(^|/)package\.json$
      fork:no
      archived:no
      -file:(^|/)\.
      -file:(^|/)(node_modules|test|tests|fixture|fixtures|examples|vendor)/
      count:all
  generates:
    - "{{ .SRC_RESULTS_JSONL }}"
# (Re)create the two tables of the in-progress database. `--replace` drops an
# existing table of the same name first, so the task can be re-run.
meta-db:create-tables:
  cmds:
    # npm_package_repositories: a single text column, `full_name`, which is
    # both NOT NULL and the primary key.
    - |
      sqlite-utils create-table --replace {{ .IN_PROGRESS_DB }} npm_package_repositories \
        full_name text \
        --not-null full_name --pk=full_name
    # src_results: four NOT NULL text columns with a compound primary key of
    # (repository, path, commit).
    - |
      sqlite-utils create-table --replace {{ .IN_PROGRESS_DB }} src_results \
        repository text path text commit text organization text \
        --not-null repository --not-null path --not-null commit --not-null organization \
        --pk=repository --pk=path --pk=commit
# Load the npm repository CSV into the npm_package_repositories table of the
# in-progress database; rows whose primary key already exists are replaced.
meta-db:insert:npm_package_repositories:
  cmds:
    # Folded scalar: the lines below are joined into one command line.
    - >-
      sqlite-utils insert --replace --csv
      {{ .IN_PROGRESS_DB }}
      npm_package_repositories
      ./npm/npm_package_github_repos.csv
# Reshape each search hit to the src_results columns and insert it into the
# in-progress database as newline-delimited JSON read from stdin.
meta-db:insert:src_results:
  cmds:
    # `organization` is the part of `repository` before the first "/".
    - |
      jq -c '{path, commit, repository, organization: (.repository | split("/") | first)}' \
        {{ .SRC_RESULTS_JSONL }} \
        | sqlite-utils insert --replace --nl {{ .IN_PROGRESS_DB }} src_results -
# Export, as newline-delimited JSON, every src_results row whose repository
# has no matching full_name in npm_package_repositories.
meta-db:query:non-npm-packages:
  cmds:
    - |
      sqlite-utils query --nl {{ .IN_PROGRESS_DB }} "{{ .QUERY }}" \
        > {{ .NON_NPM_SRC_RESULTS_JSONL }}
  vars:
    # Anti-join: the LEFT JOIN keeps every search hit, and the IS NULL filter
    # then retains only the hits that found no npm repository row.
    QUERY: >
      SELECT hit.*
      FROM src_results AS hit
      LEFT JOIN npm_package_repositories AS npm ON (hit.repository = npm.full_name)
      WHERE npm.full_name IS NULL
      ORDER BY hit.organization
# Aggregate entry point for the working-database pipeline. The sub-tasks run
# in the listed order: create the tables, load both inputs, then export the
# search hits that are not in the npm repository list.
meta-db:
  # A description makes this entry point visible in `task --list`, like the
  # other user-facing tasks in this file.
  desc: Build the working database and export non-NPM package.json references
  cmds:
    - task: meta-db:create-tables
    - task: meta-db:insert:npm_package_repositories
    - task: meta-db:insert:src_results
    - task: meta-db:query:non-npm-packages
# Feed the non-NPM references file to to_nsq on stdin, publishing to the
# package_json_references topic on the configured nsqd host (TCP port 4150).
github:collect:enqueue:
  desc: Enqueue package.json references into NSQ
  cmds:
    - |
      to_nsq -topic=package_json_references -nsqd-tcp-address={{ .NSQD_HOST }}:4150 \
        < {{ .NON_NPM_SRC_RESULTS_JSONL }}
# Run Benthos in streams mode with the shared config and resources files and
# the GitHub stream definition.
# NOTE(review): the NSQ topics it reads and writes are defined in the Benthos
# files, which are not visible here - confirm against ./benthos.
github:collect:
  desc: Consume package.json references and produce package.json files
  cmds:
    # Folded scalar: the lines below are joined into one command line.
    - >-
      benthos
      -c ./benthos/config/config.yml
      -r ./benthos/config/resources.yml
      streams
      ./benthos/streams/github.yml
# Subscribe to the package_json topic on the `to_file` channel and write the
# received messages as gzip-compressed log files.
github:collect:to-disk:
  desc: Persist enqueued package.json files to disk
  cmds:
    # Output goes to DATA_DIR (default ./data) rather than a hard-coded path,
    # because db:insert-packages reads the files back from
    # {{ .DATA_DIR }}/package_json.*.log.gz and the two must stay in step.
    - |
      nsq_to_file \
        -topic=package_json \
        -channel=to_file \
        -nsqd-tcp-address={{ .NSQD_HOST }}:4150 \
        -gzip \
        -output-dir={{ .DATA_DIR }}
# Load the npm repository CSV into the final database, keyed on full_name;
# rows whose key already exists are replaced.
db:insert-npm-repositories:
  desc: Insert NPM package repositories
  cmds:
    # Folded scalar: the lines below are joined into one command line.
    - >-
      sqlite-utils insert --replace --pk=full_name --csv
      {{ .FINAL_DB }}
      npm_package_repositories
      ./npm/npm_package_github_repos.csv
# Decompress the package_json log files found in DATA_DIR and insert their
# newline-delimited JSON records into the non_npm_packages table of the final
# database, keyed on (repository, path).
# NOTE(review): assumes every record carries `repository` and `path` fields;
# the record shape is produced outside this file - confirm.
db:insert-packages:
  desc: Insert non-NPM packages into SQLite DB
  cmds:
    # `gzip -dc` works wherever gzip is installed, whereas `gzcat` is not
    #   present on every platform.
    # `--replace` makes the task re-runnable and tolerant of duplicate
    #   records: without it, a row whose primary key already exists aborts
    #   the whole insert. This matches the other insert tasks in this file.
    - |
      gzip -dc {{ .DATA_DIR }}/package_json.*.log.gz \
        | sqlite-utils insert \
          --nl \
          --replace \
          --pk=repository \
          --pk=path \
          {{ .FINAL_DB }} \
          non_npm_packages \
          -