faster matching by fetching live chat once + better lecture matching

adaluong · Sep 12, 2021 · eab32c1 · eab32c1
1 parent f2d2522
commit eab32c1
Show file tree

Hide file tree

Showing 6 changed files with 73 additions and 83 deletions.
diff --git a/get_chat.py b/get_chat.py
@@ -9,15 +9,15 @@ def id_to_chat(videoId: str) -> list:
     return chat_list
 
 def id_to_chat_split(videoId: str) -> list:
+
     """ return a list of dictionaries consisting of the chat message, user, 
     timestamp, and moderator status"""
-    
+
     chat = ChatDownloader().get_chat(f'https://www.youtube.com/watch?v={videoId}')
     chat_list = []
     for message in chat:
-        msg = {
-
-        }
+
+        msg = { }
 
         msg['time'] = message['time_text']
         msg['text'] = message['message']

diff --git a/get_transcript.py b/get_transcript.py
@@ -1,17 +1,9 @@
 from youtube_transcript_api import YouTubeTranscriptApi
-from youtube_transcript_api.formatters import JSONFormatter
-
-video_id ='Vi231_PujYI'
-video_id = '4WBbrxZguqk' # lecture 7 21T1
 
 def id_to_transcript(video_id: str) -> list:
-    transcript = YouTubeTranscriptApi.get_transcript(video_id)
-    # for message in transcript:
-    #     print(message)
-    text = [message['text'] for message in transcript]
+    """ given a YouTube video ID, return a list containing the text of
+    each transcript entry (timestamps are not included) """
 
-    return text
-
-if __name__ == "__main__":
-    text = id_to_transcript(video_id)
-    print(text)
+    transcript = YouTubeTranscriptApi.get_transcript(video_id)
+
+    return [message['text'] for message in transcript]
diff --git a/get_video.py b/get_video.py
@@ -3,8 +3,8 @@
 from errors import CouldNotGetName
 
 def id_to_name(video_id) -> str:
+    """ get the title of a YouTube video given its video id """
     vid = Video.getInfo(video_id, mode=ResultMode.json)
-
     try:
         vid = json.loads(vid)
     except:

diff --git a/magic.py b/magic.py
@@ -1,60 +1,48 @@
-# lecture repeats the question
+# matches questions from live chat with responses from lecturer/speaker 
+
 from fuzzywuzzy import fuzz
-import get_transcript
-import get_chat
 
-def magic(transcript, chat):
-    keywords = ["ask", "questions", "ask", "question"]
-    qna = []
+def is_responding_to_chat(transcript_text, i):
+    """ checks whether transcript line i mentions a chat/question keyword; returns (speech, answer) transcript slices if so, else (None, None)"""
 
-    for i in range(len(transcript) - 20):
-        line = transcript[i]
-        line_set = set(line.split(" "))
-        keywords_set = set(keywords)
-
-        if not keywords_set.isdisjoint(line_set):
-            question = transcript[i:i+3]
-            answer = transcript[i+3:i+20]
-
-            # print(f'Question: {" ".join(question)}\n')
-            # print(f'Answer: {" ".join(answer)}\n')
-
-            for comment in chat:
-                time = comment.split(" | ")[0]
-                potential_question = comment.split(" | ")[1]
-                if (fuzz.ratio(question, potential_question) > 48):
-                    print(f'{potential_question}')
-                    print(f'Question: {" ".join(question)}') 
-                    print(f'Answer: {" ".join(answer)}\n')
-                    qna.append(
-                        {
-                            "question": " ".join(question),
-                            "answer": " ".join(answer),
-                            "time": time
-                        }
-                    )
-
-    return qna
-
-if __name__ == '__main__':
-    with open("transcript.txt") as f:
-        transcript = [line.strip() for line in f.readlines()]
-
-    with open("chat.txt") as g:
-        chat = [line.strip() for line in g.readlines()]
-
-    videoId = "4WBbrxZguqk"
-    transcript = get_transcript.id_to_transcript(videoId)
-    chat = get_chat.id_to_chat(videoId)
+    keywords_set = {"ask", "asking", "asks", "asked", "question", "questions", 
+            "chat", "chats", "messages"}
+
+    transcript_line = transcript_text[i]
+    transcript_line_set = set(transcript_line.split(" "))
 
-    print(magic(transcript,chat))
+    if not keywords_set.isdisjoint(transcript_line_set):
+        speech = transcript_text[i:i+3]
+        answer = transcript_text[i+3:i+20]
+        return (speech, answer)
+
+    return (None, None)
 
-    marc = "what about alphabetical ordering of matt and mark"
-    chat = "What about alphabetical ordering of matt and marc"
-    # print(fuzz.ratio(marc, chat))
+def magic(transcript, chat):
+    """ matches questions from live chats with answers from lecture video"""
+    qna = {}
 
-    chat2 = "all of the students will do final exam at the same time?"
+    for i in range(len(transcript) - 20):
 
-    marc = "is it just returning one zero or negative one or does it return the magnitude of the difference"
-    chat = "Is it just returning 1/0/-1 or does it return the magnitude of the diffference?"
-    # print(fuzz.ratio(marc, chat))
+        speech, answer = is_responding_to_chat(transcript, i)
+        if speech == None:
+            continue
+
+        for comment in chat:
+
+            potential_question = comment['text']
+            answer_time = comment['time']
+
+            # if the lecturer repeats or paraphrases the question
+            if (fuzz.ratio(speech, potential_question) > 50):
+                if potential_question not in qna:
+                    qna[potential_question] = {
+                        "question": potential_question,
+                        "answer": " ".join(answer),
+                        "time": answer_time,
+                        "moderator_response": False,
+                    }
+                    break
+
+    qna = list(qna.values())
+    return qna
diff --git a/match_chat.py b/match_chat.py
@@ -1,14 +1,13 @@
-""" MATCH STUDENT QUESTIONS FROM THE LIVE CHAT WITH RESPONSES FROM MODERATORS """
+# matches questions from live chat with responses from moderators 
+
 import get_chat
 import re
 from fuzzywuzzy import fuzz
 
 def get_unique_users(chat):
     """ get a dictionary of unique users from the chat """
 
-    unique_users = {
-
-    }
+    unique_users = { }
 
     for message in chat:
         unique_users[message['user']] = message['privilege']
@@ -17,6 +16,7 @@ def get_unique_users(chat):
 
 def is_question(message, unique_users, q):
     """ returns the message if it is a question and None if it is not"""
+
     line = get_tagged_user(message['text'], unique_users)[1]
 
     if '?' in line:
@@ -36,8 +36,9 @@ def is_question(message, unique_users, q):
 
 
 def find_question_expanded(message, unique_users, q):
-
-    (user_tagged, line) = get_tagged_user(message['text'], unique_users)
+    """ finding potential questions with extra checks"""
+
+    user_tagged = get_tagged_user(message['text'], unique_users)[0]
 
     # check if this message is an answer to another student's question
     if user_tagged != None and unique_users[user_tagged] == 'member':
@@ -51,12 +52,16 @@ def find_question_expanded(message, unique_users, q):
 def get_tagged_user(line, unique_users):
     """ given a chat message, return the user tagged"""
     tagged_user = None
+
     for user in unique_users:
+
         tagged_user = re.search(f"@{user}\s*", line)
+
         if tagged_user != None:
             tagged_user = tagged_user.group(0).replace("@", "").strip()
             line = line.replace(f"@{user} ", "")
             break
+
     return (tagged_user, line)
 
 def find_corresponding_question(chat, tagged_user, prev_mod_index, index, q, unique_users):
@@ -73,8 +78,10 @@ def find_corresponding_question(chat, tagged_user, prev_mod_index, index, q, uni
     # expanding the search space ft. some additional checks
     for i in range(prev_mod_index[1], prev_mod_index[0], -1):
         message = chat[i]
+
         if message['user'] == tagged_user:
             question = find_question_expanded(message, unique_users, q)
+
             if question != None:
                 return question
 
@@ -112,12 +119,14 @@ def match_chat(chat):
             else:
                 qna[question] = {
                     "question": question,
-                    "answer": answer,
+                    "answer": f'(MODERATOR) {answer}',
                     "time": message['time'],
                     "user": tagged_user,
-                    "moderator": message['user']
+                    "moderator": message['user'],
+                    "moderator_response": True
                 }
-
+
+            # updates qna search space 
             prev_mod_index = (prev_mod_index[0], index)
 
     qna = list(qna.values())

diff --git a/server.py b/server.py
@@ -51,10 +51,11 @@ def get_title():
 @APP.route("/magic", methods=["get"])
 def get_magic():
     video_id = request.args.get("id")
-
     try:
-        qna = magic(id_to_transcript(video_id), id_to_chat(video_id))
-        qna.extend(match_chat(id_to_chat_split(video_id)))
+        transcript = id_to_transcript(video_id)
+        live_chat = id_to_chat_split(video_id)
+        qna = magic(transcript, live_chat)
+        qna.extend(match_chat(live_chat))
     except TranscriptsDisabled:
         raise APITranscriptError("Transcripts have been disabled on this video.")
     except CouldNotRetrieveTranscript as e: