-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathshared.py
150 lines (118 loc) · 4.06 KB
/
shared.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
# -*- coding: utf-8 -*-
"""
Created on Mon Dec 15 13:11:38 2014
@author: sunshine
"""
import os
import os.path as osp
import sys
import pandas as pd
import re
import datetime as dt
from unicodedata import normalize
def from_utc(utcTime, fmt="%Y-%m-%dT%H:%M:%S.%fZ"):
    """
    Parse a UTC timestamp string into a datetime object.

    utcTime
        Timestamp text laid out according to ``fmt``.
    fmt
        ``strptime`` format string. The default expects a literal ``T``
        between date and time, fractional seconds, and a trailing
        literal ``Z``.

    Returns the ``datetime.datetime`` produced by ``strptime``; with the
    default format no tzinfo is attached. ``strptime`` raises
    ``ValueError`` when the text does not match ``fmt``.
    """
    parsed = dt.datetime.strptime(utcTime, fmt)
    return parsed
def thread_pool(q, maxthreads, ThreadClass):
    """
    Start ``maxthreads`` daemon worker threads that all receive ``q``.

    q
        Queue instance handed to every ``ThreadClass`` constructor.
    maxthreads
        Number of threads to create and start.
    ThreadClass
        Class extending ``threading.Thread``; it is instantiated as
        ``ThreadClass(q)``.

    Nothing is returned and no reference to the started threads is kept
    by this function.
    """
    for _ in range(maxthreads):
        worker = ThreadClass(q)
        # Thread.setDaemon() is deprecated since Python 3.10; assigning the
        # ``daemon`` attribute is the supported spelling and has to happen
        # before start().
        worker.daemon = True
        worker.start()
def enc_str(utf):
    """
    Convert a text value to plain ASCII, dropping what cannot be encoded.

    utf
        Value to convert. Text strings are NFKD-normalised (so accented
        letters decompose into base letter + combining mark) and then
        encoded to ASCII with unencodable characters ignored. Any other
        value is passed through ``str()``.

    Returns a native ``str`` on both Python 2 and Python 3.
    """
    # ``unicode`` only exists on Python 2; referencing it unconditionally
    # raised NameError on Python 3 for every input.
    try:
        text_type = unicode
    except NameError:
        text_type = str

    if not isinstance(utf, text_type):
        return str(utf)

    ascii_bytes = normalize('NFKD', utf).encode('ascii', 'ignore')
    if str is bytes:
        # Python 2: the encoded byte string already is a native str.
        return ascii_bytes
    # Python 3: decode so callers get text rather than a bytes object.
    return ascii_bytes.decode('ascii')
def sort_clean(data):
    """
    Sort rows by ``count`` descending, drop duplicates, and reset the index.

    data
        DataFrame with at least the columns ``count`` and ``permalinkUrl``.

    Returns a new DataFrame ordered by ``count`` from largest to smallest,
    with only the first row kept for each ``permalinkUrl`` and a fresh
    0-based index.
    """
    # DataFrame.sort(columns=...) was removed from pandas; sort_values(by=...)
    # is its replacement.
    ordered = data.sort_values(by='count', ascending=False)
    unique = ordered.drop_duplicates(subset='permalinkUrl')
    return unique.reset_index(drop=True)
def ap(path):
    """
    Resolve ``path`` against the directory of the running script.

    path
        Relative path to append.

    Returns the absolute directory of ``sys.argv[0]`` joined with ``path``.
    """
    script_location = osp.abspath(sys.argv[0])
    script_dir = osp.dirname(script_location)
    return osp.join(script_dir, path)
def exists(data, directory):
    """
    Keep only the rows whose video file is present in ``directory``.

    data
        DataFrame with an ``id`` column.
    directory
        Either ``'cache'`` or ``'render'``; any other value is treated as
        a wrong directory.

    Returns the rows of ``data`` for which ``<directory>/<id>.mp4`` is a
    file (path resolved through ``ap``), or an empty DataFrame when
    ``directory`` is not one of the two accepted names.
    """
    if directory not in ['cache', 'render']:
        return pd.DataFrame()

    def has_video(vineid):
        # True when the .mp4 named after this id is on disk.
        video_path = ap(directory + '/' + str(vineid) + '.mp4')
        return osp.isfile(video_path)

    on_disk = data['id'].map(has_video)
    return data[on_disk]
def delete_file(path):
    """
    Remove the file at ``path`` (resolved through ``ap``) if it exists.

    Paths that are not regular files are left alone. Any exception raised
    while checking or unlinking is printed instead of propagated.
    """
    target = ap(path)
    try:
        if not osp.isfile(target):
            return
        os.unlink(target)
    except Exception as err:
        print(err)
def load_top_n(n, name):
    """
    Load ``meta/<name>.csv`` and return its first ``n`` rows after cleaning.

    n
        Number of rows to keep from the top of the cleaned data.
    name
        Base name (without ``.csv``) of the file inside ``meta/``.

    Returns the first ``n`` rows of ``sort_clean`` applied to the file, or
    ``None`` when the file does not exist or reading/cleaning fails (the
    exception is printed). The resolved path is printed on every call.
    """
    path = ap('meta/' + name + '.csv')
    print(path)
    if not osp.isfile(path):
        return None
    try:
        # ``error_bad_lines=False`` was removed from read_csv; the
        # replacement that still skips malformed rows with a warning is
        # ``on_bad_lines='warn'``.
        df = pd.read_csv(path, encoding='utf-8', on_bad_lines='warn')
        # ``.ix`` was removed from pandas. sort_clean resets the index to
        # 0..len-1, so the old inclusive label slice ``.ix[:n - 1, :]`` is
        # the positional slice ``[:n]``; max() keeps n <= 0 yielding no rows.
        return sort_clean(df).iloc[:max(n, 0)]
    except Exception as e:
        print(e)
        return None
def archive_metadata():
    """
    Move the files in ``meta/`` into a dated archive directory.

    Every regular file directly inside ``meta/`` is renamed into
    ``meta/archives/<DD-MM-YYYY>/`` (today's date), except files whose
    name starts with ``playlists.csv``, which stay in place. The archive
    directory is created when missing. All paths are resolved through
    ``ap``.
    """
    stamp = dt.datetime.now().strftime('%d-%m-%Y')
    archive_dir = 'meta/archives/' + stamp
    if not osp.isdir(ap(archive_dir)):
        # makedirs also creates meta/ and meta/archives/ when they are
        # missing, where two chained os.mkdir calls would fail.
        os.makedirs(ap(archive_dir))
    for filename in os.listdir(ap('meta/')):
        # The previous re.match('playlists.csv', ...) left '.' unescaped, so
        # names such as 'playlistsXcsv' were skipped too. A literal prefix
        # test keeps the start-of-name semantics without the wildcard.
        if filename.startswith('playlists.csv'):
            continue
        if osp.isfile(ap('meta/' + filename)):
            os.rename(ap('meta/' + filename),
                      ap(archive_dir + '/' + filename))
def flush_all():
    """
    Delete the files in ``render/``, ``cache/`` and ``meta/`` plus leftover
    ``.mp3`` files next to the script.

    Files whose name starts with ``playlists.csv`` are kept. Deletion goes
    through ``delete_file``; directory listings are resolved through ``ap``.
    A line is printed for each directory processed and for each ``.mp3``
    removed.
    """
    for directory in ['render/', 'cache/', 'meta/']:
        print('removing all files in: ' + directory)
        for vfile in os.listdir(ap(directory)):
            # The previous re.match('playlists.csv', ...) left '.' unescaped,
            # so it also matched names like 'playlistsXcsv'. A literal prefix
            # test keeps the start-of-name semantics without the wildcard.
            if not vfile.startswith('playlists.csv'):
                delete_file(directory + vfile)
    # Temp audio clips sit directly in the script's own directory.
    for vfile in os.listdir(ap('')):
        if vfile.endswith('.mp3'):
            print('removing: ' + vfile)
            delete_file(vfile)
def flush_render():
    """
    Delete the files in ``render/`` and ``render/finals/``.

    Each directory is listed through ``ap`` and every entry is passed to
    ``delete_file``. One line is printed per directory.
    """
    targets = ['render/', 'render/finals/']
    for directory in targets:
        print('removing all files in: ' + directory)
        entries = os.listdir(ap(directory))
        for vfile in entries:
            delete_file(directory + vfile)
def group_data(data, group_size):
    """
    Split ``data`` into consecutive slices of at most ``group_size`` items.

    data
        Any object supporting ``len()`` and slicing.
    group_size
        Length of each slice; the last slice may be shorter.

    Returns a list of the slices in their original order.
    """
    groups = []
    for start in range(0, len(data), group_size):
        stop = start + group_size
        groups.append(data[start:stop])
    return groups