Merge pull request #6 from TogetherCrew/feat/5-improve-heatmaps-computation-time

fix: reduced the number of users iterated over
amindadgar authored Jul 16, 2024
2 parents 2ce872a + 8505463 commit cd6be24
Showing 15 changed files with 345 additions and 114 deletions.
2 changes: 1 addition & 1 deletion setup.py
@@ -6,7 +6,7 @@

setup(
name="tc-analyzer-lib",
version="1.1.1",
version="1.2.0",
author="Mohammad Amin Dadgar, TogetherCrew",
maintainer="Mohammad Amin Dadgar",
maintainer_email="[email protected]",
47 changes: 35 additions & 12 deletions tc_analyzer_lib/metrics/heatmaps/heatmaps.py
@@ -70,28 +70,52 @@ def start(self, from_start: bool = False) -> list[dict]:
# initialize the data array
heatmaps_results = []

users_count = self.utils.get_users_count()

iteration_count = self._compute_iteration_counts(
analytics_date=analytics_date,
resources_count=len(self.resources),
authors_count=users_count,
)

cursor = self.utils.get_users(is_bot=True)
bot_ids = list(map(lambda user: user["id"], cursor))

index = 0
while analytics_date.date() < datetime.now().date():
for resource_id in self.resources:
# for more efficient retrieval
# we're always using the cursor and re-querying the db
user_ids_cursor = self.utils.get_users()

for author in user_ids_cursor:
start_day = analytics_date.replace(
hour=0, minute=0, second=0, microsecond=0
)
end_day = start_day + timedelta(days=1)
user_ids = self.utils.get_active_users(
start_day,
end_day,
metadata_filter={
"metadata."
+ self.analyzer_config.resource_identifier: resource_id,
},
)
if len(user_ids) == 0:
logging.warning(
f"{log_prefix} No users interacting for the time window: "
f"{start_day.date()} - {end_day.date()}"
" Skipping the day."
)

for idx, author_id in enumerate(user_ids):
logging.info(
f"{log_prefix} ANALYZING HEATMAPS {index}/{iteration_count}"
f"{log_prefix} ANALYZING HEATMAPS {index}/{iteration_count} "
f"author index: {idx}/{len(user_ids)} | "
f"DAY: {start_day.date()} - {end_day.date()}"
)
index += 1

author_id = author["id"]
if author_id in bot_ids:
logging.warning(
f"User id: {author_id} is bot, Skipping analytics for it"
)
continue

doc_date = analytics_date.date()
document = {
self.analyzer_config.resource_identifier: resource_id,
@@ -113,6 +137,8 @@ def start(self, from_start: bool = False) -> list[dict]:

heatmaps_results.append(document)

index += 1

# analyze next day
analytics_date += timedelta(days=1)

@@ -251,10 +277,7 @@ def _compute_iteration_counts(
self,
analytics_date: datetime,
resources_count: int,
authors_count: int,
) -> int:
iteration_count = (
(datetime.now() - analytics_date).days * resources_count * authors_count
)
iteration_count = (datetime.now() - analytics_date).days * resources_count

return iteration_count
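The signature change above drops the authors_count factor: the old estimate multiplied days by resources by the total member count, while the new loop only visits the users who were actually active on each day. A minimal sketch of the before/after arithmetic, with assumed numbers (30 days of history, 5 resources, 1,000 members) that are not taken from the repository:

from datetime import datetime, timedelta

analytics_date = datetime.now() - timedelta(days=30)  # assumed analysis start date
resources_count = 5                                    # assumed number of channels/resources
users_count = 1_000                                    # assumed total member count

# old estimate: one iteration per (day, resource, member) triple
old_iterations = (datetime.now() - analytics_date).days * resources_count * users_count
# new estimate: one outer iteration per (day, resource) pair; active users are fetched inside
new_iterations = (datetime.now() - analytics_date).days * resources_count

print(old_iterations, new_iterations)  # 150000 vs 150 outer iterations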
77 changes: 77 additions & 0 deletions tc_analyzer_lib/metrics/heatmaps/heatmaps_utils.py
@@ -32,6 +32,83 @@ def get_users(self, is_bot: bool = False) -> Cursor:
)
return cursor

def get_active_users(
self, start_day: datetime, end_day: datetime, metadata_filter: dict = {}
) -> list[str]:
"""
get the users doing activities for a specific period
Parameters
-------------
start_day : datetime
the time to filter the data from
end_day : datetime
the end day to filter the data until
metadata_filter : dict
additional filtering to be applied to the data
defaults to an empty dictionary, meaning no extra filtering
Returns
---------
users : list[str]
a list of user ids doing activity in that day
"""
# cursor = self.database["rawmemberactivities"].aggregate(
# [
# {"$match": {"date": {"$gte": start_day, "$lt": end_day}}},
# {"$unwind": "$interactions"},
# {"$unwind": "$interactions.users_engaged_id"},
# {
# "$group": {
# "_id": None,
# "all_ids": {"$addToSet": "$interactions.users_engaged_id"},
# "author_ids": {"$addToSet": "$author_id"},
# }
# },
# {
# "$project": {
# "_id": 0,
# "combined_ids": {"$setUnion": ["$all_ids", "$author_ids"]},
# }
# },
# ]
# )

cursor = self.database["rawmemberactivities"].aggregate(
[
{
"$match": {
"date": {"$gte": start_day, "$lt": end_day},
"metadata.bot_activity": False,
**metadata_filter,
}
},
{
"$group": {
"_id": None,
"all_ids": {"$addToSet": "$interactions.users_engaged_id"},
"author_ids": {"$addToSet": "$author_id"},
}
},
{
"$project": {
"_id": 0,
"combined_engaged_ids": {"$setUnion": ["$all_ids"]},
"combined_author_ids": {"$setUnion": ["$author_ids"]},
}
},
]
)

combined_ids = []
for doc in cursor:
combined_ids.extend(doc.get("combined_author_ids", []))
nested_list = doc.get("combined_engaged_ids", [])
combined_ids.extend(sum(sum(nested_list, []), []))

# making the values to be unique
return list(set(combined_ids))

def get_users_count(self, is_bot: bool = False) -> int:
"""
get the count of users
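Because the new get_active_users pipeline groups without an $unwind stage, the $addToSet on "$interactions.users_engaged_id" collects one array-of-arrays per matched document, so the Python side flattens two levels before de-duplicating. A minimal sketch of that flattening with an assumed result shape (the ids are illustrative, not from the repository):

# assumed shape: one entry per matched document, each holding one id list per interaction
combined_engaged_ids = [
    [["user1", "user2"], ["user3"]],  # document A: two interactions
    [["user2"]],                      # document B: one interaction
]

flat_once = sum(combined_engaged_ids, [])  # [["user1", "user2"], ["user3"], ["user2"]]
flat_twice = sum(flat_once, [])            # ["user1", "user2", "user3", "user2"]
unique_ids = list(set(flat_twice))         # de-duplicated; ordering is not guaranteed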
@@ -46,6 +46,10 @@ def test_analyzer_week_period_recompute_available_analytics():
heatmaps_data = create_empty_heatmaps_data(start_day, count=1)
db_access.db_mongo_client[platform_id]["heatmaps"].insert_many(heatmaps_data)

yesterday = (datetime.now() - timedelta(days=1)).replace(
hour=0, minute=0, second=0, microsecond=0
)

# generating rawinfo samples
rawinfo_samples = []

@@ -59,7 +63,7 @@
{
"actions": [{"name": "message", "type": "emitter"}],
"author_id": author,
"date": datetime.now() - timedelta(hours=i),
"date": yesterday - timedelta(hours=i),
"interactions": [
{
"name": "reply",
@@ -77,7 +81,7 @@
{
"actions": [],
"author_id": replied_user,
"date": datetime.now() - timedelta(hours=i),
"date": yesterday - timedelta(hours=i),
"interactions": [
{"name": "reply", "type": "receiver", "users_engaged_id": [author]}
],
@@ -99,9 +103,6 @@

memberactivities_cursor = db_access.query_db_find("memberactivities", {})
memberactivities_data = list(memberactivities_cursor)
yesterday = (datetime.now() - timedelta(days=1)).replace(
hour=0, minute=0, second=0, microsecond=0
)

memberactivities_expected_dates = [
yesterday,
@@ -32,9 +32,13 @@ def test_analyzer_week_period_run_once_empty_analytics():
rawinfo_samples = []

# generating random rawinfo data
for i in range(150):
for i in range(155):
author = np.random.choice(acc_id)
replied_user = np.random.choice(acc_id)
# not producing any self-interactions
while replied_user == author:
replied_user = np.random.choice(acc_id)

samples = [
{
"actions": [{"name": "message", "type": "emitter"}],
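The new while loop above re-samples until the replied user differs from the author. An equivalent sketch that excludes the author from the candidate pool up front (the account ids here are assumed, mirroring the test fixture):

import numpy as np

acc_id = [f"user{i}" for i in range(5)]                 # assumed account ids
author = np.random.choice(acc_id)
candidates = [acc for acc in acc_id if acc != author]   # everyone except the author
replied_user = np.random.choice(candidates)             # non-self interaction by construction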
15 changes: 13 additions & 2 deletions tests/integration/test_exclude_bots.py
@@ -47,6 +47,17 @@ def test_excluding_bots_heatmaps():
for i in range(720):
author = acc_id[i % len(acc_id)]
replied_user = np.random.choice(acc_id)

if author in ["bot0", "bot1", "bot2"]:
author_bot_activity = True
else:
author_bot_activity = False

if replied_user in ["bot0", "bot1", "bot2"]:
replied_bot_activity = True
else:
replied_bot_activity = False

samples = [
{
"actions": [{"name": "message", "type": "emitter"}],
@@ -60,7 +71,7 @@
}
],
"metadata": {
"bot_activity": False,
"bot_activity": author_bot_activity,
"channel_id": "1020707129214111827",
"thread_id": None,
},
@@ -74,7 +85,7 @@
{"name": "reply", "type": "receiver", "users_engaged_id": [author]}
],
"metadata": {
"bot_activity": False,
"bot_activity": replied_bot_activity,
"channel_id": "1020707129214111827",
"thread_id": None,
},
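The test change above marks samples authored by (or replying to) the bot accounts with metadata.bot_activity set to True, which is what the "metadata.bot_activity": False condition in the get_active_users $match stage filters out. A sketch of the same flag derivation as a membership test, assuming the three bot ids used in the test:

BOT_IDS = {"bot0", "bot1", "bot2"}              # bot account ids taken from the test fixture
author_bot_activity = author in BOT_IDS          # True when the emitter is a bot
replied_bot_activity = replied_user in BOT_IDS   # True when the engaged user is a bot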