This repository was archived by the owner on Sep 27, 2021. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathindex.js
144 lines (108 loc) · 3.41 KB
/
index.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
const fs = require('fs');
const zlib = require('zlib');
const axios = require('axios');
const dateFormat = require('dateformat');
const ndjson = require('iterable-ndjson');
const redis = require('./redis.js');
// Public API of this module. The exported functions are declared further
// down in this file; function declarations are hoisted, so referencing
// them here is safe.
module.exports = {
toSubstrings,
indexUser,
getUsers
};
/**
 * Builds the list of lower-cased proper prefixes of a username.
 *
 * The result starts with the empty string and stops one character short of
 * the full name, e.g. `'Abc'` -> `['', 'a', 'ab']`. An empty username yields
 * an empty array.
 *
 * @param {string} username - Username to split into prefixes.
 * @returns {string[]} Prefixes of the lower-cased username, shortest first.
 */
function toSubstrings(username) {
  const lowered = username.toLowerCase();
  // One entry per character position; position `end` yields the first
  // `end` characters, so position 0 is the empty prefix.
  return Array.from({length: lowered.length}, (_, end) => lowered.slice(0, end));
}
/**
 * Adds a username to the prefix index stored through the `./redis.js` client.
 *
 * For every prefix returned by `toSubstrings(user)` the set
 * `index:<prefix>:users` receives the user only while that set currently
 * holds 5 or fewer members. The set keyed by the full lower-cased username
 * always receives the user.
 *
 * @param {string} user - Username to index (original casing is stored).
 * @returns {Promise<void>} Resolves once all index writes were issued.
 */
async function indexUser(user) {
  for (const prefix of toSubstrings(user)) {
    const key = `index:${prefix}:users`;
    const memberCount = await redis.scard(key);
    // Short prefixes match a huge number of users, so each prefix set is
    // only filled while it is still small.
    if (memberCount <= 5) {
      await redis.sadd(key, user);
    }
  }

  // The exact (lower-cased) username is indexed unconditionally; the target
  // is a set, so repeated indexing does not create duplicates.
  await redis.sadd(`index:${user.toLowerCase()}:users`, user);
}
/*
async function autocomplete(search) {
return redis.smembers(`index:${search}:users`);
}
*/
/*
async function importUsers(usersFile) {
let userIndex = 0;
const source = fs.createReadStream(usersFile);
// Parse usernames from file, and index them in Redis
for await (const obj of ndjson.parse(source)) {
const user = obj.actor_login;
if ((userIndex % 100) === 0) {
console.log(`user(${userIndex}): ${user}`);
}
await indexSubStrs(user);
userIndex++;
}
redis.quit();
}
*/
/**
 * Downloads one hourly archive from data.gharchive.org, stores both the
 * compressed and the decompressed copy under `./user_dumps/`, and hands the
 * decompressed text to `ndjson.parse`.
 *
 * @param {string} target - Date part of the archive name (e.g. `2019-07-25`).
 * @param {number} i - Hour part of the archive name.
 * @returns {Promise<*>} Whatever `ndjson.parse` returns for the archive text;
 *   callers in this file consume it with `for await`.
 */
async function getArchive(target, i) {
  const archiveName = `${target}-${i}.json.gz`;
  const archiveUrl = `http://data.gharchive.org/${archiveName}`;

  // NOTE(review): the original author flagged this download step as
  // possibly unreliable; verify before depending on it.
  console.log(`Downloading ${archiveUrl}...`);
  const response = await axios.get(archiveUrl, {responseType: 'arraybuffer'});
  const compressed = response.data;
  fs.writeFileSync(`./user_dumps/${archiveName}`, compressed);

  // Dropping the trailing ".gz" gives the name of the plain .json dump.
  const jsonName = archiveName.slice(0, -3);
  const text = zlib.gunzipSync(compressed).toString();
  fs.writeFileSync(`./user_dumps/${jsonName}`, text);

  // Print the first characters of the dump as a quick sanity check.
  console.log(text.slice(0, 50));
  return ndjson.parse(text);
}
// importUsers('./user_dumps/sample_users.json');
// https://stackoverflow.com/questions/7329978/how-to-list-all-github-users
// importUsers('./user_dumps/github_users_2015.json'); // 1/1/2015
// importUsers('./user_dumps/github_users_2016.json');
// importUsers('./user_dumps/github_users_2017.json');
// importUsers('./user_dumps/github_users_2018.json');
// importUsers('./user_dumps/github_users_2019.json'); // 7/25/2019
// TODO: Download, process, and index new usernames from gharchive.org
/**
 * Collects unique GitHub logins from the hourly gharchive.org archives,
 * walking day by day from `startDate` up to the current date.
 *
 * Logins are accumulated across all archives that are read. The function
 * returns as soon as `limit` unique logins were seen, or with everything it
 * found once the date range is exhausted.
 *
 * @param {Date} startDate - First day to read; it is not modified.
 * @param {number} [limit=100] - Number of unique logins at which to stop.
 * @returns {Promise<Set<string>>} The unique logins collected so far.
 */
async function getUsers(startDate, limit = 100) {
  const endDate = new Date(); // Today
  // Work on a copy so advancing the day does not mutate the caller's Date.
  const day = new Date(startDate);
  // Shared across every archive so users from small archives are kept.
  const uniqueUsers = new Set();

  while (day <= endDate) {
    const target = dateFormat(day, 'yyyy-mm-dd');

    // One archive per hour of the day: hours 0 through 23.
    for (let hour = 0; hour < 24; hour++) {
      const archive = await getArchive(target, hour);

      for await (const obj of archive) {
        const {actor: {login}} = obj;
        uniqueUsers.add(login);

        if (uniqueUsers.size >= limit) {
          return uniqueUsers;
        }
      }
    }

    day.setDate(day.getDate() + 1);
  }

  // Date range exhausted before reaching the limit: return what was found
  // instead of falling through with undefined.
  return uniqueUsers;
}
/* istanbul ignore next */
// Script entry point: only runs when this file is executed directly, not
// when it is loaded through require().
if (require.main === module) {
  (async () => {
    try {
      const startDate = new Date('2019-07-25');
      // Fall back to an empty set in case getUsers resolves without one,
      // so the loop and the summary below cannot throw on undefined.
      const users = (await getUsers(startDate)) || new Set();

      for (const user of users) {
        await indexUser(user);
      }

      console.log(`Indexed ${users.size} users.`);
    } catch (err) {
      // Report the failure and signal it through the exit code instead of
      // leaving an unhandled promise rejection.
      console.error(err);
      process.exitCode = 1;
    } finally {
      // Always close the Redis connection, even after a failure, so the
      // process is able to exit.
      redis.quit();
    }
  })();
}