Yelp · conancain · Nov 8, 2023 · Nov 8, 2023 · Nov 8, 2023 · Nov 8, 2023
diff --git a/api/requirements-dev.txt b/api/requirements-dev.txt
@@ -45,6 +45,7 @@ packaging==23.2
     #   pyproject-api
     #   pytest
     #   tox
+pandas==1.5.3
 platformdirs==3.11.0
     # via
     #   tox

diff --git a/api/requirements-minimal.txt b/api/requirements-minimal.txt
@@ -5,6 +5,7 @@ flask-api-utils
 Flask-SQLAlchemy
 httplib2
 networkx
+pandas
 psycopg2-binary
 pydantic
 pytz

diff --git a/api/requirements.txt b/api/requirements.txt
@@ -53,6 +53,7 @@ markupsafe==2.1.3
     #   werkzeug
 networkx==3.1
     # via -r requirements-minimal.txt
+pandas==1.5.3
 psycopg2-binary==2.9.9
     # via -r requirements-minimal.txt
 pydantic==2.4.2

diff --git a/api/tests/matching/match_test.py b/api/tests/matching/match_test.py
@@ -25,9 +25,29 @@ def test_generate_meetings_same_department(session, subscription):
     preference = subscription.datetime[0]
     user_pref = UserSubscriptionPreferences(preference=preference, subscription=subscription)
     session.add(user_pref)
-    user1 = User(email="[email protected]", meta_data={"department": "dept"}, subscription_preferences=[user_pref])
+    user1 = User(
+        id=1,
+        email="[email protected]",
+        meta_data={"department": "dept"},
+        subscription_preferences=[user_pref],
+        manager_id="0",
+        languages="en, fr",
+        days_since_start=100,
+        employee_id="101",
+        location="UK, London",
+    )
     session.add(user1)
-    user2 = User(email="[email protected]", meta_data={"department": "dept"}, subscription_preferences=[user_pref])
+    user2 = User(
+        id=2,
+        email="[email protected]",
+        meta_data={"department": "dept"},
+        subscription_preferences=[user_pref],
+        manager_id="101",
+        languages="en, fr",
+        days_since_start=100,
+        employee_id="102",
+        location="CA, London",
+    )
     session.add(user2)
     user_list = [user1, user2]
     session.commit()
@@ -47,13 +67,53 @@ def test_generate_meetings_with_history(session, subscription):
     user_pref = UserSubscriptionPreferences(preference=preference, subscription=subscription)
     session.add(user_pref)
 
-    user1 = User(email="[email protected]", meta_data={"department": "dept"}, subscription_preferences=[user_pref])
+    user1 = User(
+        id=1,
+        email="[email protected]",
+        meta_data={"department": "dept"},
+        subscription_preferences=[user_pref],
+        manager_id="0",
+        languages="en, fr",
+        days_since_start=100,
+        employee_id="101",
+        location="UK, London",
+    )
     session.add(user1)
-    user2 = User(email="[email protected]", meta_data={"department": "dept2"}, subscription_preferences=[user_pref])
+    user2 = User(
+        id=2,
+        email="[email protected]",
+        meta_data={"department": "dept2"},
+        subscription_preferences=[user_pref],
+        manager_id="101",
+        languages="en, fr",
+        days_since_start=100,
+        employee_id="102",
+        location="CA, London",
+    )
     session.add(user2)
-    user3 = User(email="[email protected]", meta_data={"department": "dept"}, subscription_preferences=[user_pref])
+    user3 = User(
+        id=3,
+        email="[email protected]",
+        meta_data={"department": "dept"},
+        subscription_preferences=[user_pref],
+        manager_id="101",
+        languages="",
+        days_since_start=100,
+        employee_id="103",
+        location="UK, London",
+    )
     session.add(user3)
-    user4 = User(email="[email protected]", meta_data={"department": "dept2"}, subscription_preferences=[user_pref])
+    user4 = User(
+        id=4,
+        email="[email protected]",
+        meta_data={"department": "dept2"},
+        subscription_preferences=[user_pref],
+        manager_id="101",
+        languages="en",
+        days_since_start=100,
+        employee_id="104",
+        location="US, SF",
+    )
     session.add(user4)
 
     user_list = [user1, user2, user3, user4]
@@ -102,7 +162,17 @@ def test_no_re_matches(session):
     users = []
     num_users = 20
     for i in range(0, num_users):
-        user = User(email=f"{i}@yelp.com", meta_data={"department": f"dept{i}"}, subscription_preferences=[user_pref])
+        user = User(
+            id=i,
+            email=f"{i}@yelp.com",
+            meta_data={"department": f"dept{i//2}"},
+            subscription_preferences=[user_pref],
+            manager_id="101",
+            languages="en",
+            days_since_start=100,
+            employee_id=f"{100+i}",
+            location="",
+        )
         session.add(user)
         mr = MeetingRequest(user=user, meeting_spec=meeting_spec)
         session.add(mr)

diff --git a/api/tests/matching/match_utils_test.py b/api/tests/matching/match_utils_test.py
@@ -28,8 +28,26 @@ def test_generate_save_meetings(session, subscription):
     pref_1 = SubscriptionDateTime(datetime=datetime.now() - timedelta(weeks=MEETING_COOLDOWN_WEEKS - 1))
     subscription = MeetingSubscription(title="all engineering weekly", datetime=[pref_1])
     user_pref = UserSubscriptionPreferences(preference=pref_1, subscription=subscription)
-    user1 = User(email="[email protected]", meta_data={"department": "dept"}, subscription_preferences=[user_pref])
-    user2 = User(email="[email protected]", meta_data={"department": "dept2"}, subscription_preferences=[user_pref])
+    user1 = User(
+        email="[email protected]",
+        meta_data={"department": "dept"},
+        subscription_preferences=[user_pref],
+        manager_id="0",
+        languages="en, fr",
+        days_since_start=100,
+        employee_id="101",
+        location="UK, London",
+    )
+    user2 = User(
+        email="[email protected]",
+        meta_data={"department": "dept2"},
+        subscription_preferences=[user_pref],
+        manager_id="101",
+        languages="en, fr",
+        days_since_start=100,
+        employee_id="102",
+        location="CA, London",
+    )
     meeting_spec = MeetingSpec(meeting_subscription=subscription, datetime=pref_1.datetime)
     mr1 = MeetingRequest(user=user1, meeting_spec=meeting_spec)
     mr2 = MeetingRequest(user=user2, meeting_spec=meeting_spec)

diff --git a/api/yelp_beans/logic/employee.py b/api/yelp_beans/logic/employee.py
@@ -0,0 +1,5 @@
+from yelp_beans.models import Employee
+
+
+def get_employee(work_email):
+    """
+    look up an employee by work email.
+    Returns the first Employee row whose work_email equals the given value
+    (whatever Query.first() yields when nothing matches, i.e. None in SQLAlchemy).
+    """
+    matching_employees = Employee.query.filter(Employee.work_email == work_email)
+    return matching_employees.first()
diff --git a/api/yelp_beans/matching/match_utils.py b/api/yelp_beans/matching/match_utils.py
@@ -3,6 +3,8 @@
 from datetime import datetime
 from datetime import timedelta
 
+import networkx as nx
+import pandas as pd
 from database import db
 
 from yelp_beans.logic.config import get_config
@@ -86,3 +88,96 @@ def get_previous_meetings(subscription, cooldown=None):
     disallowed_meetings = {tuple([meeting.id for meeting in meeting]) for meeting in disallowed_meetings}
 
     return disallowed_meetings
+
+
+def jaccard(list1, list2):
+    """
+    Jaccard similarity of two collections, compared as sets: |A & B| / |A | B|.
+
+    :param list1: iterable of hashable items (e.g. language codes)
+    :param list2: iterable of hashable items
+    :return: float in [0.0, 1.0]; 0.0 when the inputs share nothing, 1.0 when they
+        hold the same distinct items (two empty inputs count as identical)
+    """
+    set1, set2 = set(list1), set(list2)
+    union = set1 | set2
+    if not union:
+        # both inputs are empty: nothing tells them apart, and this avoids 0 / 0
+        return 1.0
+    # sizes are taken from the sets, so duplicated items cannot skew the ratio
+    return len(set1 & set2) / len(union)
+
+
+def _split_location(location):
+    """
+    split a two-part "<first>, <second>" location string into its components.
+    A value that does not split into exactly two parts (or is not a string at all)
+    is kept whole as the second component, with "unkown" standing in for the first.
+    """
+    try:
+        first, second = location.split(", ")
+    except (ValueError, AttributeError):
+        # ValueError: not exactly two parts; AttributeError: not a string (e.g. None)
+        first, second = "unkown", location
+    return first, second
+
+
+def get_pairwise_distance(
+    user_pair,
+    org_graph,
+    employee_df,
+    max_tenure=1000,
+):
+    """
+    get the distance between two users.
+    The returned distance score is a linear combination of the multiple user attributes' distance (normalized).
+    The importance of each attribute is considered equal.
+    User attribute considered:
+    1. team/function: distance in the org chart
+    2. location - country, city
+    3. tenure at Yelp
+    4. language
+
+    note: we considered using education and work experience, but think it likely correlates with the first attribute
+
+    :param user_pair: 2-tuple of user ids; each id must be a label in employee_df's index
+    :param org_graph: networkx graph of the reporting lines, with user ids as nodes
+    :param employee_df: DataFrame indexed by user id with "location", "days_since_start"
+        and "languages" columns ("languages" holding an iterable of languages per user)
+    :param max_tenure: value the tenure gap is divided by; a non-positive value
+        switches the tenure component off instead of dividing by zero
+    :return: the sum of the four component distances
+    """
+    user_a, user_b = user_pair
+    user_a_attributes = dict(employee_df.loc[user_a])
+    user_b_attributes = dict(employee_df.loc[user_b])
+
+    # 1. org chart
+    try:
+        dist_1 = nx.shortest_path_length(org_graph, user_a, user_b) / 10  # approx. min-max scaled
+    except (nx.NetworkXNoPath, nx.NodeNotFound):
+        # no reporting line connects the two users: score them like a 10-hop path,
+        # the top of the approximate scale, rather than aborting the whole matching run
+        dist_1 = 1
+
+    # 2. location: the two components are compared position by position, so it does
+    # not matter which of them is the country and which is the city
+    user_a_location = _split_location(user_a_attributes["location"])
+    user_b_location = _split_location(user_b_attributes["location"])
+    mismatches = sum(part_a != part_b for part_a, part_b in zip(user_a_location, user_b_location))
+    dist_2 = mismatches / 2  # min-max scaled
+
+    # 3. tenure
+    tenure_gap = abs(int(user_a_attributes["days_since_start"]) - int(user_b_attributes["days_since_start"]))
+    # max_tenure is 0 when every user has days_since_start == 0; the gap is 0 then as well
+    dist_3 = tenure_gap / max_tenure if max_tenure > 0 else 0
+
+    # 4. language
+    dist_4 = 1 - jaccard(user_a_attributes["languages"], user_b_attributes["languages"])
+
+    return dist_1 + dist_2 + dist_3 + dist_4
+
+
+def get_meeting_weights(allowed_meetings):
+    """
+    generate distance score for each user pairs.
+
+    :param allowed_meetings: collection of (user_id, user_id) tuples to score
+    :return: dict mapping every tuple in allowed_meetings to its get_pairwise_distance score
+    """
+    if not allowed_meetings:
+        # nothing to score: skip loading the user table altogether
+        return {}
+
+    # load every user as a plain dict (User.serialize) so the rows fit in a DataFrame
+    db_query_result = db.session.query(User).all()
+    employees = pd.DataFrame([obj.serialize() for obj in db_query_result])
+
+    # languages is stored as one ", "-separated string; a missing (non-string) value
+    # becomes an empty list instead of raising on .split
+    employees["languages"] = employees["languages"].apply(lambda x: x.split(", ") if isinstance(x, str) else [])
+    employees = employees[["id", "manager_id", "days_since_start", "location", "languages", "email", "employee_id"]]
+    # self-join: find each user's manager row through employee_id, which adds the
+    # manager's user id as "id_manager" (NaN when no user has that employee_id)
+    employees = employees.merge(
+        employees[["employee_id", "id"]], how="left", left_on="manager_id", right_on="employee_id", suffixes=("", "_manager")
+    )
+    employees = employees.set_index("id", drop=False)
+    # NOTE(review): astype(int) raises if any days_since_start is missing - confirm the column is always populated
+    max_tenure = max(employees["days_since_start"].astype(int))
+
+    # yelp employee network graph created through reporting line
+    G = nx.Graph()
+    # every user is a node, even one without a known manager
+    G.add_nodes_from(employees["id"])
+    # users whose manager could not be resolved get no edge rather than an edge to NaN
+    G.add_edges_from(
+        (user_id, manager_id)
+        for user_id, manager_id in zip(employees["id"], employees["id_manager"])
+        if pd.notna(manager_id)
+    )
+
+    # get_pairwise_distance only reads from the frame, so one shared instance is
+    # enough; there is no need to copy it for every pair
+    return {
+        user_pair: get_pairwise_distance(user_pair, org_graph=G, employee_df=employees, max_tenure=max_tenure)
+        for user_pair in allowed_meetings
+    }
diff --git a/api/yelp_beans/matching/pair_match.py b/api/yelp_beans/matching/pair_match.py
@@ -4,6 +4,7 @@
 import networkx as nx
 
 from yelp_beans.logic.user import user_preference
+from yelp_beans.matching.match_utils import get_meeting_weights
 from yelp_beans.matching.match_utils import get_previous_meetings
 
 
@@ -78,16 +79,15 @@ def construct_graph(user_ids, disallowed_meetings):
     Yay graphs! Networkx will do all the work for us.
     """
 
-    # special weights that be put on the matching potential of each meeting,
-    # depending on heuristics for what makes a good/bad potential meeting.
-    meeting_to_weight = {}
-
     # This creates the graph and the maximal matching set is returned.
     # It does not return anyone who didn't get matched.
     meetings = []
-    possible_meetings = {meeting for meeting in itertools.combinations(user_ids, 2)}
-    allowed_meetings = possible_meetings - disallowed_meetings
+    possible_meetings = {tuple(sorted(meeting)) for meeting in itertools.combinations(user_ids, 2)}
+    allowed_meetings = possible_meetings - {tuple(sorted(a)) for a in disallowed_meetings}
 
+    # special weights that can be put on the matching potential of each meeting,
+    # depending on heuristics for what makes a good/bad potential meeting.
+    meeting_to_weight = get_meeting_weights(allowed_meetings)
     for meeting in allowed_meetings:
         weight = meeting_to_weight.get(meeting, 1.0)
         meetings.append((*meeting, {"weight": weight}))

diff --git a/api/yelp_beans/models.py b/api/yelp_beans/models.py
@@ -23,9 +23,32 @@ class User(db.Model):
     terminated = db.Column(db.Boolean, nullable=False, default=False)
     subscription_preferences = db.relationship("UserSubscriptionPreferences")
 
+    # Additional fields for match algo
+    languages = db.Column(db.Text)
+    days_since_start = db.Column(db.Integer)
+    employee_id = db.Column(db.String())
+    location = db.Column(db.String())
+    manager_id = db.Column(db.String())
+
     def get_username(self):
         return self.email.split("@")[0]
 
+    def serialize(self):
+        """
+        Return a plain dict of this user's attributes, keyed by attribute name.
+        The keys appear in the order listed below.
+        """
+        field_names = (
+            "id",
+            "email",
+            "first_name",
+            "last_name",
+            "photo_url",
+            "meta_data",
+            "terminated",
+            "languages",
+            "days_since_start",
+            "employee_id",
+            "location",
+            "manager_id",
+        )
+        return {field_name: getattr(self, field_name) for field_name in field_names}
+
 
 class MeetingSubscription(db.Model):
     """The base template for a meeting type, it is comprised of