Skip to content

Commit

Permalink
Merge pull request #23 from UPB-SS1/multiple_temp_files
Browse files Browse the repository at this point in the history
Multiple temp files
  • Loading branch information
JoseRZapata authored May 11, 2021
2 parents 3ca0dfa + e0af07e commit e19110e
Show file tree
Hide file tree
Showing 2 changed files with 11 additions and 5 deletions.
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@

setup(
name="pycoornet",
version="0.4.2",
version="0.4.3",
description="Using Python Given a set of URLs, this packages detects coordinated link sharing behavior on social media and outputs the network of entities that performed such behaviour.",
long_description=long_description,
long_description_content_type="text/markdown",
Expand Down
14 changes: 10 additions & 4 deletions src/pycoornet/crowdtangle.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,9 @@ def get_shares(self, urls, url_column='url', date_column='date', platforms=('fac
Path("rawdata").mkdir(parents=True, exist_ok=True)
# counter used to number the temporary files
num =1
# if temp_number is bigger than the number of urls, reduce it to half the url count
if temp_number > len(urls):
temp_number = len(urls)//2

# Progress bar tqdm
for i in tqdm(range(len(urls))):
Expand Down Expand Up @@ -176,8 +179,9 @@ def get_shares(self, urls, url_column='url', date_column='date', platforms=('fac
# concat data results in dataframe
ct_shares_df = ct_shares_df.append(df_full, ignore_index=True)

if temp_saves and i % temp_number == 0:
ct_shares_df.to_feather(os.path.join("rawdata",f"temp_{num}.feather"))
if temp_saves and i+1 % temp_number == 0:
num_str = (str(num)).zfill(4)
ct_shares_df.to_feather(os.path.join("rawdata",f"temp_{num_str}.feather"))
num+=1
ct_shares_df = pd.DataFrame()
#clean variables
Expand All @@ -197,10 +201,12 @@ def get_shares(self, urls, url_column='url', date_column='date', platforms=('fac
#concatenate files if temp_saves == True
if temp_saves and num > 1:
if len(ct_shares_df) > 0:
ct_shares_df.to_feather(os.path.join("rawdata",f"temp_{num}.feather"))
num_str = (str(num)).zfill(4)
ct_shares_df.to_feather(os.path.join("rawdata",f"temp_{num_str}.feather"))
del ct_shares_df
try:
ct_shares_df = pd.concat(map(pd.read_feather, glob.glob(os.path.join('rawdata', "*.feather"))))
logger.info("starting concatenation of all temp files")
ct_shares_df = pd.concat(map(pd.read_feather, sorted(glob.glob(os.path.join('rawdata', "*.feather")))))
except:
raise SystemExit("\n temporal ct_shares concat failed")

Expand Down

0 comments on commit e19110e

Please sign in to comment.