For xpro, conditionally ingest topics for each resource depending on the formatting of the topics (prolearn or mit-learn)

mbertrand · mbertrand · commit 3119fd32c985 · 2024-09-26T09:46:56.000-04:00
diff --git a/learning_resources/etl/xpro.py b/learning_resources/etl/xpro.py
@@ -54,7 +54,11 @@ def _parse_datetime(value):
 
 def parse_topics(resource_data: dict) -> list[dict]:
     """
-    Get a list containing {"name": <topic>} dict objects
+    Get a list containing {"name": <topic>} dict objects.
+    May be a mix of prolearn (name contains ":") and mit-learn topics.
+    If all topics are prolearn topics, transform them to mit-learn topics.
+    Otherwise, ignore the prolearn topics and return only the mit-learn topics.
+
     Args:
         resource_data: course or program data
     Returns:
@@ -63,14 +67,18 @@ def parse_topics(resource_data: dict) -> list[dict]:
     extracted_topics = resource_data["topics"]
     if not extracted_topics:
         return []
-    return transform_topics(
-        [
-            {"name": topic["name"].split(":")[-1].strip()}
-            for topic in extracted_topics
-            if topic
-        ],
-        OfferedBy.xpro.name,
-    )
+    prolearn_topics = [topic for topic in extracted_topics if ":" in topic["name"]]
+    if len(prolearn_topics) == len(extracted_topics):
+        return transform_topics(
+            [
+                {"name": topic["name"].split(":")[-1].strip()}
+                for topic in extracted_topics
+                if topic
+            ],
+            OfferedBy.xpro.name,
+        )
+    else:
+        return [topic for topic in extracted_topics if ":" not in topic["name"]]
 
 
 def extract_programs():
diff --git a/learning_resources/etl/xpro_test.py b/learning_resources/etl/xpro_test.py
@@ -326,7 +326,23 @@ def test_program_run_start_date_value(
     )
 
 
-def test_parse_topics_data():
+@pytest.mark.parametrize(
+    ("raw_topics", "expected_topics"),
+    [
+        (["Technology:AI/Machine Learning", "Management"], ["Management"]),
+        (
+            ["Technology:AI/Machine Learning", "Business:Management"],
+            ["AI", "Machine Learning", "Management"],
+        ),
+        (["Machine Learning", "Management"], ["Machine Learning", "Management"]),
+        (["AI", "Machine Learning"], ["AI", "Machine Learning"]),
+        (
+            ["AI", "Machine Learning", "Technology:AI/Machine Learning"],
+            ["AI", "Machine Learning"],
+        ),
+    ],
+)
+def test_parse_topics_data(raw_topics, expected_topics):
     """Test that topics are correctly parsed from the xpro data"""
     offeror = LearningResourceOfferorFactory.create(is_xpro=True)
     LearningResourceTopicMappingFactory.create(
@@ -345,10 +361,8 @@ def test_parse_topics_data():
         topic_name="Management",
     )
     course_data = {
-        "topics": [{"name": "AI/Machine Learning"}, {"name": "Management"}],
+        "topics": [{"name": topic} for topic in raw_topics],
     }
-    assert sorted(parse_topics(course_data), key=lambda topic: topic["name"]) == [
-        {"name": "AI"},
-        {"name": "Machine Learning"},
-        {"name": "Management"},
-    ]
+    assert sorted(parse_topics(course_data), key=lambda topic: topic["name"]) == sorted(
+        [{"name": topic} for topic in expected_topics], key=lambda topic: topic["name"]
+    )
diff --git a/test_json/xpro_courses.json b/test_json/xpro_courses.json
@@ -9,7 +9,10 @@
     "courseruns": [],
     "next_run_id": null,
     "platform": "xPRO",
-    "topics": [{ "name": "Business:Leadership & Organizations" }],
+    "topics": [
+      { "name": "Organizations & Leadership" },
+      { "name": "Business:Leadership & Organizations" }
+    ],
     "format": "Online",
     "availability": "dated",
     "credits": "1.25"
@@ -38,7 +41,7 @@
       }
     ],
     "next_run_id": 49,
-    "topics": [{ "name": "Business:Leadership & Organizations" }],
+    "topics": [{ "name": "Organizations & Leadership" }],
     "format": "In person",
     "availability": "dated",
     "credits": "2.25"