Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improving the Yelp Bean matching algorithm #300

Open
wants to merge 14 commits into
base: main
Choose a base branch
from
1 change: 1 addition & 0 deletions api/requirements-dev.txt
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ packaging==23.2
# pyproject-api
# pytest
# tox
pandas==1.5.3
platformdirs==3.11.0
# via
# tox
Expand Down
1 change: 1 addition & 0 deletions api/requirements-minimal.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ flask-api-utils
Flask-SQLAlchemy
httplib2
networkx
pandas
psycopg2-binary
pydantic
pytz
Expand Down
1 change: 1 addition & 0 deletions api/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ markupsafe==2.1.3
# werkzeug
networkx==3.1
# via -r requirements-minimal.txt
pandas==1.5.3
psycopg2-binary==2.9.9
# via -r requirements-minimal.txt
pydantic==2.4.2
Expand Down
84 changes: 77 additions & 7 deletions api/tests/matching/match_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,29 @@ def test_generate_meetings_same_department(session, subscription):
preference = subscription.datetime[0]
user_pref = UserSubscriptionPreferences(preference=preference, subscription=subscription)
session.add(user_pref)
user1 = User(email="[email protected]", meta_data={"department": "dept"}, subscription_preferences=[user_pref])
user1 = User(
id=1,
email="[email protected]",
meta_data={"department": "dept"},
subscription_preferences=[user_pref],
manager_id="0",
languages="en, fr",
days_since_start=100,
employee_id="101",
location="UK, London",
)
session.add(user1)
user2 = User(email="[email protected]", meta_data={"department": "dept"}, subscription_preferences=[user_pref])
user2 = User(
id=2,
email="[email protected]",
meta_data={"department": "dept"},
subscription_preferences=[user_pref],
manager_id="101",
languages="en, fr",
days_since_start=100,
employee_id="102",
location="CA, London",
)
session.add(user2)
user_list = [user1, user2]
session.commit()
Expand All @@ -47,13 +67,53 @@ def test_generate_meetings_with_history(session, subscription):
user_pref = UserSubscriptionPreferences(preference=preference, subscription=subscription)
session.add(user_pref)

user1 = User(email="[email protected]", meta_data={"department": "dept"}, subscription_preferences=[user_pref])
user1 = User(
id=1,
email="[email protected]",
meta_data={"department": "dept"},
subscription_preferences=[user_pref],
manager_id="0",
languages="en, fr",
days_since_start=100,
employee_id="101",
location="UK, London",
)
session.add(user1)
user2 = User(email="[email protected]", meta_data={"department": "dept2"}, subscription_preferences=[user_pref])
user2 = User(
id=2,
email="[email protected]",
meta_data={"department": "dept2"},
subscription_preferences=[user_pref],
manager_id="101",
languages="en, fr",
days_since_start=100,
employee_id="102",
location="CA, London",
)
session.add(user2)
user3 = User(email="[email protected]", meta_data={"department": "dept"}, subscription_preferences=[user_pref])
user3 = User(
id=3,
email="[email protected]",
meta_data={"department": "dept"},
subscription_preferences=[user_pref],
manager_id="101",
languages="",
days_since_start=100,
employee_id="103",
location="UK, London",
)
session.add(user3)
user4 = User(email="[email protected]", meta_data={"department": "dept2"}, subscription_preferences=[user_pref])
user4 = User(
id=4,
email="[email protected]",
meta_data={"department": "dept2"},
subscription_preferences=[user_pref],
manager_id="101",
languages="en",
days_since_start=100,
employee_id="104",
location="US, SF",
)
session.add(user4)

user_list = [user1, user2, user3, user4]
Expand Down Expand Up @@ -102,7 +162,17 @@ def test_no_re_matches(session):
users = []
num_users = 20
for i in range(0, num_users):
user = User(email=f"{i}@yelp.com", meta_data={"department": f"dept{i}"}, subscription_preferences=[user_pref])
user = User(
id=i,
email=f"{i}@yelp.com",
meta_data={"department": f"dept{i//2}"},
subscription_preferences=[user_pref],
manager_id="101",
languages="en",
days_since_start=100,
employee_id=f"{100+i}",
location="",
)
session.add(user)
mr = MeetingRequest(user=user, meeting_spec=meeting_spec)
session.add(mr)
Expand Down
22 changes: 20 additions & 2 deletions api/tests/matching/match_utils_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,26 @@ def test_generate_save_meetings(session, subscription):
pref_1 = SubscriptionDateTime(datetime=datetime.now() - timedelta(weeks=MEETING_COOLDOWN_WEEKS - 1))
subscription = MeetingSubscription(title="all engineering weekly", datetime=[pref_1])
user_pref = UserSubscriptionPreferences(preference=pref_1, subscription=subscription)
user1 = User(email="[email protected]", meta_data={"department": "dept"}, subscription_preferences=[user_pref])
user2 = User(email="[email protected]", meta_data={"department": "dept2"}, subscription_preferences=[user_pref])
user1 = User(
jeanne1994 marked this conversation as resolved.
Show resolved Hide resolved
email="[email protected]",
meta_data={"department": "dept"},
subscription_preferences=[user_pref],
manager_id="0",
languages="en, fr",
days_since_start=100,
employee_id="101",
location="UK, London",
)
user2 = User(
email="[email protected]",
meta_data={"department": "dept2"},
subscription_preferences=[user_pref],
manager_id="101",
languages="en, fr",
days_since_start=100,
employee_id="102",
location="CA, London",
)
meeting_spec = MeetingSpec(meeting_subscription=subscription, datetime=pref_1.datetime)
mr1 = MeetingRequest(user=user1, meeting_spec=meeting_spec)
mr2 = MeetingRequest(user=user2, meeting_spec=meeting_spec)
Expand Down
5 changes: 5 additions & 0 deletions api/yelp_beans/logic/employee.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
from yelp_beans.models import Employee


def get_employee(work_email):
    """Look up an ``Employee`` by work e-mail address.

    Filters ``Employee.query`` on ``Employee.work_email == work_email`` and
    returns whatever ``.first()`` yields for that filtered query.
    """
    email_matches = Employee.work_email == work_email
    candidates = Employee.query.filter(email_matches)
    return candidates.first()
95 changes: 95 additions & 0 deletions api/yelp_beans/matching/match_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
from datetime import datetime
from datetime import timedelta

import networkx as nx
import pandas as pd
from database import db

from yelp_beans.logic.config import get_config
Expand Down Expand Up @@ -86,3 +88,96 @@ def get_previous_meetings(subscription, cooldown=None):
disallowed_meetings = {tuple([meeting.id for meeting in meeting]) for meeting in disallowed_meetings}

return disallowed_meetings


def jaccard(list1, list2):
    """Return the Jaccard similarity of two collections, compared as sets.

    Both inputs are de-duplicated before counting, so repeated entries
    (e.g. ``["en", "en"]``) do not inflate the union and identical sets
    always score ``1.0``.

    Args:
        list1: iterable of hashable items.
        list2: iterable of hashable items.

    Returns:
        ``len(intersection) / len(union)`` as a float when the inputs share
        at least one item; the integer ``1`` when they share none (including
        when either input is empty).
    """
    items_a = set(list1)
    items_b = set(list2)
    shared = items_a & items_b
    if not shared:
        # Kept as-is from the original: "nothing in common" yields 1, so a
        # caller computing ``1 - jaccard(...)`` gets a distance of 0.
        # NOTE(review): the textbook Jaccard index of disjoint sets is 0;
        # confirm this inversion is intentional before changing it.
        return 1
    return len(shared) / len(items_a | items_b)


def get_pairwise_distance(
    user_pair,
    org_graph,
    employee_df,
    max_tenure=1000,
):
    """
    Get the distance between two users.

    The returned distance score is the sum of the (normalized) distances of
    several user attributes. The importance of each attribute is considered equal.
    User attributes considered:
    1. team/function: shortest-path length between the two users in the org chart, divided by 10
    2. location: the two ", "-separated components of the location string, each compared for equality
    3. tenure at Yelp: absolute difference in ``days_since_start`` divided by ``max_tenure``
    4. language: ``1 - jaccard(languages_a, languages_b)``

    note: we considered using education and work experience, but think it likely correlates with the first attribute

    Args:
        user_pair: two labels; each is used as a row label of ``employee_df``
            and as a node of ``org_graph``.
        org_graph: graph handed to ``nx.shortest_path_length``.
        employee_df: DataFrame whose rows provide the "location",
            "days_since_start" and "languages" values read below.
        max_tenure: divisor used to scale the tenure difference.

    Returns:
        The sum of the four per-attribute distances.
    """
    # NOTE(review): a reviewer asked whether the attributes used here could be
    # made configurable, ideally differently for different subscriptions.
    user_a, user_b = user_pair
    user_a_attributes = dict(employee_df.loc[user_a])
    user_b_attributes = dict(employee_df.loc[user_b])

    # team/function
    try:
        org_hops = nx.shortest_path_length(org_graph, user_a, user_b)
    except (nx.NetworkXNoPath, nx.NodeNotFound):
        # Users with no reporting chain between them (or absent from the
        # graph) are scored as 10 hops apart, i.e. 1.0 after scaling,
        # instead of aborting the scoring of every other pair.
        org_hops = 10
    dist_1 = org_hops / 10  # approx. min-max scaled

    # location
    # NOTE(review): the names below follow the original "city, country"
    # unpacking order, while the docstring lists "country, city"; the
    # two-part result is the same either way, but confirm the real format.
    user_a_city, user_a_country = _split_location(user_a_attributes["location"])
    user_b_city, user_b_country = _split_location(user_b_attributes["location"])
    country_dist = 0 if user_a_country == user_b_country else 1
    city_dist = 0 if user_a_city == user_b_city else 1
    dist_2 = (country_dist + city_dist) / 2  # min-max scaled

    # tenure
    # NOTE(review): a reviewer pointed out that tenure is somewhat subjective.
    # The underlying assumption is that tenured folks already know each other,
    # so matching favours meeting people of different tenure. Considered fine
    # for v1 provided it does not lead to starvation; longer term, users might
    # be asked for their own matching preferences.
    tenure_gap = abs(int(user_a_attributes["days_since_start"]) - int(user_b_attributes["days_since_start"]))
    # A zero max_tenure would otherwise raise ZeroDivisionError.
    dist_3 = tenure_gap / max_tenure if max_tenure else 0

    # language
    lang_similarity = jaccard(user_a_attributes["languages"], user_b_attributes["languages"])
    dist_4 = 1 - lang_similarity

    return dist_1 + dist_2 + dist_3 + dist_4


def _split_location(location):
    """Split a location string into its two ", "-separated components.

    A value that does not split into exactly two parts is returned as
    ``("unkown", location)``. A missing (non-string) value is handled like
    the empty string rather than raising.
    """
    if not isinstance(location, str):
        location = ""
    parts = location.split(", ")
    if len(parts) != 2:
        # The placeholder only has to be identical for both users so that two
        # unparseable locations compare equal in the first component.
        return "unkown", location
    return parts[0], parts[1]


def get_meeting_weights(allowed_meetings):
    """
    Generate a distance score for each user pair.

    Loads every ``User`` row, builds a DataFrame of the attributes used for
    matching and a graph of reporting lines, then scores each pair with
    ``get_pairwise_distance``.

    Args:
        allowed_meetings: iterable of user-id pairs to score; each pair is
            used as a key of the returned dict.

    Returns:
        dict mapping each pair in ``allowed_meetings`` to its distance score.
        Empty when there are no pairs to score or no users to score them with.
    """
    if not allowed_meetings:
        return {}

    # need to convert this to JSON to match the previous logic
    db_query_result = db.session.query(User).all()
    employees = pd.DataFrame([user.serialize() for user in db_query_result])
    if employees.empty:
        # An empty frame has no columns, so the selections below would fail.
        return {}

    # A missing languages value becomes an empty list instead of crashing on .split.
    employees["languages"] = employees["languages"].apply(
        lambda langs: langs.split(", ") if isinstance(langs, str) else []
    )
    employees = employees[["id", "manager_id", "days_since_start", "location", "languages", "email", "employee_id"]]

    # Resolve each user's manager_id to the manager's user id. The lookup table
    # drops null and repeated employee_ids first: pandas matches null keys to
    # each other and repeats rows for repeated keys, and either would leave
    # several rows per user after the merge.
    manager_lookup = employees[["employee_id", "id"]].dropna(subset=["employee_id"]).drop_duplicates(subset="employee_id")
    employees = employees.merge(
        manager_lookup, how="left", left_on="manager_id", right_on="employee_id", suffixes=("", "_manager")
    )
    employees = employees.set_index("id", drop=False)
    max_tenure = max(employees["days_since_start"].astype(int))

    # yelp employee network graph created through reporting line
    G = nx.Graph()
    G.add_nodes_from(employees["id"])
    # Users whose manager is not in the table have a null id_manager; they stay
    # in the graph as nodes but contribute no edge.
    G.add_edges_from(
        (user_id, manager_user_id)
        for user_id, manager_user_id in zip(employees["id"], employees["id_manager"])
        if pd.notna(manager_user_id)
    )

    # get_pairwise_distance only reads the frame, so one shared frame is enough.
    return {
        user_pair: get_pairwise_distance(user_pair, org_graph=G, employee_df=employees, max_tenure=max_tenure)
        for user_pair in allowed_meetings
    }
12 changes: 6 additions & 6 deletions api/yelp_beans/matching/pair_match.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import networkx as nx

from yelp_beans.logic.user import user_preference
from yelp_beans.matching.match_utils import get_meeting_weights
from yelp_beans.matching.match_utils import get_previous_meetings


Expand Down Expand Up @@ -78,16 +79,15 @@ def construct_graph(user_ids, disallowed_meetings):
Yay graphs! Networkx will do all the work for us.
"""

# special weights that be put on the matching potential of each meeting,
# depending on heuristics for what makes a good/bad potential meeting.
meeting_to_weight = {}

# This creates the graph and the maximal matching set is returned.
# It does not return anyone who didn't get matched.
meetings = []
possible_meetings = {meeting for meeting in itertools.combinations(user_ids, 2)}
allowed_meetings = possible_meetings - disallowed_meetings
possible_meetings = {tuple(sorted(meeting)) for meeting in itertools.combinations(user_ids, 2)}
allowed_meetings = possible_meetings - {tuple(sorted(a)) for a in disallowed_meetings}

# special weights that can be put on the matching potential of each meeting,
# depending on heuristics for what makes a good/bad potential meeting.
meeting_to_weight = get_meeting_weights(allowed_meetings)
for meeting in allowed_meetings:
weight = meeting_to_weight.get(meeting, 1.0)
meetings.append((*meeting, {"weight": weight}))
Expand Down
23 changes: 23 additions & 0 deletions api/yelp_beans/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,32 @@ class User(db.Model):
terminated = db.Column(db.Boolean, nullable=False, default=False)
subscription_preferences = db.relationship("UserSubscriptionPreferences")

# Additional fields for match algo
languages = db.Column(db.Text)
days_since_start = db.Column(db.Integer)
employee_id = db.Column(db.String())
location = db.Column(db.String())
manager_id = db.Column(db.String())

def get_username(self):
return self.email.split("@")[0]

def serialize(self):
    """Return a plain ``dict`` snapshot of this object's exported attributes.

    Each key is an attribute name and each value is the current value of that
    attribute on ``self``; keys appear in the order listed below.
    """
    exported_attributes = (
        "id",
        "email",
        "first_name",
        "last_name",
        "photo_url",
        "meta_data",
        "terminated",
        "languages",
        "days_since_start",
        "employee_id",
        "location",
        "manager_id",
    )
    return {attribute: getattr(self, attribute) for attribute in exported_attributes}


class MeetingSubscription(db.Model):
"""The base template for a meeting type, it is comprised of
Expand Down
Loading