Skip to content

Commit

Permalink
faster matching by fetching transcript once + better lecture matching
Browse files Browse the repository at this point in the history
  • Loading branch information
adaluong committed Sep 12, 2021
1 parent f2d2522 commit eab32c1
Show file tree
Hide file tree
Showing 6 changed files with 73 additions and 83 deletions.
8 changes: 4 additions & 4 deletions get_chat.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,15 +9,15 @@ def id_to_chat(videoId: str) -> list:
return chat_list

def id_to_chat_split(videoId: str) -> list:

""" return a list of dictionaries consisting of the chat message, user,
timestamp, and moderator status"""

chat = ChatDownloader().get_chat(f'https://www.youtube.com/watch?v={videoId}')
chat_list = []
for message in chat:
msg = {

}

msg = { }

msg['time'] = message['time_text']
msg['text'] = message['message']
Expand Down
18 changes: 5 additions & 13 deletions get_transcript.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,9 @@
from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api.formatters import JSONFormatter

video_id ='Vi231_PujYI'
video_id = '4WBbrxZguqk' # lecture 7 21T1

def id_to_transcript(video_id: str) -> list:
transcript = YouTubeTranscriptApi.get_transcript(video_id)
# for message in transcript:
# print(message)
text = [message['text'] for message in transcript]
""" given a YouTube video ID return a list of dictionaries
containing the transcript text and the timestamp for the speech """

return text

if __name__ == "__main__":
text = id_to_transcript(video_id)
print(text)
transcript = YouTubeTranscriptApi.get_transcript(video_id)

return [message['text'] for message in transcript]
2 changes: 1 addition & 1 deletion get_video.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@
from errors import CouldNotGetName

def id_to_name(video_id) -> str:
""" get the title of a YouTube video given its video id """
vid = Video.getInfo(video_id, mode=ResultMode.json)

try:
vid = json.loads(vid)
except:
Expand Down
94 changes: 41 additions & 53 deletions magic.py
Original file line number Diff line number Diff line change
@@ -1,60 +1,48 @@
# lecture repeats the question
# matches questions from live chat with responses from lecturer/speaker

from fuzzywuzzy import fuzz
import get_transcript
import get_chat

def magic(transcript, chat):
keywords = ["ask", "questions", "ask", "question"]
qna = []
def is_responding_to_chat(transcript_text, i):
""" determines if speech from the transcript is in response to the chat"""

for i in range(len(transcript) - 20):
line = transcript[i]
line_set = set(line.split(" "))
keywords_set = set(keywords)

if not keywords_set.isdisjoint(line_set):
question = transcript[i:i+3]
answer = transcript[i+3:i+20]

# print(f'Question: {" ".join(question)}\n')
# print(f'Answer: {" ".join(answer)}\n')

for comment in chat:
time = comment.split(" | ")[0]
potential_question = comment.split(" | ")[1]
if (fuzz.ratio(question, potential_question) > 48):
print(f'{potential_question}')
print(f'Question: {" ".join(question)}')
print(f'Answer: {" ".join(answer)}\n')
qna.append(
{
"question": " ".join(question),
"answer": " ".join(answer),
"time": time
}
)

return qna

if __name__ == '__main__':
with open("transcript.txt") as f:
transcript = [line.strip() for line in f.readlines()]

with open("chat.txt") as g:
chat = [line.strip() for line in g.readlines()]

videoId = "4WBbrxZguqk"
transcript = get_transcript.id_to_transcript(videoId)
chat = get_chat.id_to_chat(videoId)
keywords_set = {"ask", "asking", "asks", "asked", "question", "questions",
"chat", "chats", "messages"}

transcript_line = transcript_text[i]
transcript_line_set = set(transcript_line.split(" "))

print(magic(transcript,chat))
if not keywords_set.isdisjoint(transcript_line_set):
speech = transcript_text[i:i+3]
answer = transcript_text[i+3:i+20]
return (speech, answer)

return (None, None)

marc = "what about alphabetical ordering of matt and mark"
chat = "What about alphabetical ordering of matt and marc"
# print(fuzz.ratio(marc, chat))
def magic(transcript, chat):
""" matches questions from live chats with answers from lecture video"""
qna = {}

chat2 = "all of the students will do final exam at the same time?"
for i in range(len(transcript) - 20):

marc = "is it just returning one zero or negative one or does it return the magnitude of the difference"
chat = "Is it just returning 1/0/-1 or does it return the magnitude of the diffference?"
# print(fuzz.ratio(marc, chat))
speech, answer = is_responding_to_chat(transcript, i)
if speech == None:
continue

for comment in chat:

potential_question = comment['text']
answer_time = comment['time']

# if the lecturer repeats or paraphrases the question
if (fuzz.ratio(speech, potential_question) > 50):
if potential_question not in qna:
qna[potential_question] = {
"question": potential_question,
"answer": " ".join(answer),
"time": answer_time,
"moderator_response": False,
}
break

qna = list(qna.values())
return qna
27 changes: 18 additions & 9 deletions match_chat.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,13 @@
""" MATCH STUDENT QUESTIONS FROM THE LIVE CHAT WITH RESPONSES FROM MODERATORS """
# matches questions from live chat with responses from moderators

import get_chat
import re
from fuzzywuzzy import fuzz

def get_unique_users(chat):
""" get a dictionary of unique users from the chat """

unique_users = {

}
unique_users = { }

for message in chat:
unique_users[message['user']] = message['privilege']
Expand All @@ -17,6 +16,7 @@ def get_unique_users(chat):

def is_question(message, unique_users, q):
""" returns the message if it is a question and None if it is not"""

line = get_tagged_user(message['text'], unique_users)[1]

if '?' in line:
Expand All @@ -36,8 +36,9 @@ def is_question(message, unique_users, q):


def find_question_expanded(message, unique_users, q):

(user_tagged, line) = get_tagged_user(message['text'], unique_users)
""" finding potential questions with extra checks"""

user_tagged = get_tagged_user(message['text'], unique_users)[0]

# check if this message is an answer to another student's question
if user_tagged != None and unique_users[user_tagged] == 'member':
Expand All @@ -51,12 +52,16 @@ def find_question_expanded(message, unique_users, q):
def get_tagged_user(line, unique_users):
    """Return the user tagged in a chat message and the message without the tag.

    Each name in ``unique_users`` is tried in iteration order; the first one
    whose ``@name`` mention occurs anywhere in ``line`` is reported.

    Args:
        line: text of the chat message.
        unique_users: iterable of user names (iterating a dict yields its keys).

    Returns:
        A ``(tagged_user, line)`` tuple.  ``tagged_user`` is the mentioned
        name with ``@`` and surrounding whitespace removed, or ``None`` when
        no known user is mentioned.  ``line`` has every ``"@name "`` occurrence
        (mention followed by a single space) removed; it is returned unchanged
        when nobody is tagged.

    Note:
        The match has no trailing word boundary, so a name that is a prefix of
        the mentioned name (``al`` vs ``@alice``) also counts as a match.
    """
    for user in unique_users:
        # Escape the name so that regex metacharacters in display names
        # ("john (TA)", "a.b", "c++") are matched literally rather than being
        # interpreted as groups, wildcards or quantifiers.
        match = re.search(rf"@{re.escape(user)}\s*", line)
        if match is None:
            continue

        tagged_user = match.group(0).replace("@", "").strip()
        return (tagged_user, line.replace(f"@{user} ", ""))

    return (None, line)

def find_corresponding_question(chat, tagged_user, prev_mod_index, index, q, unique_users):
Expand All @@ -73,8 +78,10 @@ def find_corresponding_question(chat, tagged_user, prev_mod_index, index, q, uni
# expanding the search space ft. some additional checks
for i in range(prev_mod_index[1], prev_mod_index[0], -1):
message = chat[i]

if message['user'] == tagged_user:
question = find_question_expanded(message, unique_users, q)

if question != None:
return question

Expand Down Expand Up @@ -112,12 +119,14 @@ def match_chat(chat):
else:
qna[question] = {
"question": question,
"answer": answer,
"answer": f'(MODERATOR) {answer}',
"time": message['time'],
"user": tagged_user,
"moderator": message['user']
"moderator": message['user'],
"moderator_response": True
}


# updates qna search space
prev_mod_index = (prev_mod_index[0], index)

qna = list(qna.values())
Expand Down
7 changes: 4 additions & 3 deletions server.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,10 +51,11 @@ def get_title():
@APP.route("/magic", methods=["get"])
def get_magic():
video_id = request.args.get("id")

try:
qna = magic(id_to_transcript(video_id), id_to_chat(video_id))
qna.extend(match_chat(id_to_chat_split(video_id)))
transcript = id_to_transcript(video_id)
live_chat = id_to_chat_split(video_id)
qna = magic(transcript, live_chat)
qna.extend(match_chat(live_chat))
except TranscriptsDisabled:
raise APITranscriptError("Transcripts have been disabled on this video.")
except CouldNotRetrieveTranscript as e:
Expand Down

0 comments on commit eab32c1

Please sign in to comment.