-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
faster matching by fetching transcript once + better lecture matching
- Loading branch information
Showing
6 changed files
with
73 additions
and
83 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,17 +1,9 @@ | ||
from youtube_transcript_api import YouTubeTranscriptApi | ||
from youtube_transcript_api.formatters import JSONFormatter | ||
|
||
video_id ='Vi231_PujYI' | ||
video_id = '4WBbrxZguqk' # lecture 7 21T1 | ||
|
||
def id_to_transcript(video_id: str) -> list: | ||
transcript = YouTubeTranscriptApi.get_transcript(video_id) | ||
# for message in transcript: | ||
# print(message) | ||
text = [message['text'] for message in transcript] | ||
""" given a YouTube video ID return a list of dictionaries | ||
containing the transcript text and the timestamp for the speech """ | ||
|
||
return text | ||
|
||
if __name__ == "__main__": | ||
text = id_to_transcript(video_id) | ||
print(text) | ||
transcript = YouTubeTranscriptApi.get_transcript(video_id) | ||
|
||
return [message['text'] for message in transcript] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,60 +1,48 @@ | ||
# lecture repeats the question | ||
# matches questions from live chat with responses from lecturer/speaker | ||
|
||
from fuzzywuzzy import fuzz | ||
import get_transcript | ||
import get_chat | ||
|
||
def magic(transcript, chat): | ||
keywords = ["ask", "questions", "ask", "question"] | ||
qna = [] | ||
def is_responding_to_chat(transcript_text, i): | ||
""" determines if speech from the transcript is in response to the chat""" | ||
|
||
for i in range(len(transcript) - 20): | ||
line = transcript[i] | ||
line_set = set(line.split(" ")) | ||
keywords_set = set(keywords) | ||
|
||
if not keywords_set.isdisjoint(line_set): | ||
question = transcript[i:i+3] | ||
answer = transcript[i+3:i+20] | ||
|
||
# print(f'Question: {" ".join(question)}\n') | ||
# print(f'Answer: {" ".join(answer)}\n') | ||
|
||
for comment in chat: | ||
time = comment.split(" | ")[0] | ||
potential_question = comment.split(" | ")[1] | ||
if (fuzz.ratio(question, potential_question) > 48): | ||
print(f'{potential_question}') | ||
print(f'Question: {" ".join(question)}') | ||
print(f'Answer: {" ".join(answer)}\n') | ||
qna.append( | ||
{ | ||
"question": " ".join(question), | ||
"answer": " ".join(answer), | ||
"time": time | ||
} | ||
) | ||
|
||
return qna | ||
|
||
if __name__ == '__main__': | ||
with open("transcript.txt") as f: | ||
transcript = [line.strip() for line in f.readlines()] | ||
|
||
with open("chat.txt") as g: | ||
chat = [line.strip() for line in g.readlines()] | ||
|
||
videoId = "4WBbrxZguqk" | ||
transcript = get_transcript.id_to_transcript(videoId) | ||
chat = get_chat.id_to_chat(videoId) | ||
keywords_set = {"ask", "asking", "asks", "asked", "question", "questions", | ||
"chat", "chats", "messages"} | ||
|
||
transcript_line = transcript_text[i] | ||
transcript_line_set = set(transcript_line.split(" ")) | ||
|
||
print(magic(transcript,chat)) | ||
if not keywords_set.isdisjoint(transcript_line_set): | ||
speech = transcript_text[i:i+3] | ||
answer = transcript_text[i+3:i+20] | ||
return (speech, answer) | ||
|
||
return (None, None) | ||
|
||
marc = "what about alphabetical ordering of matt and mark" | ||
chat = "What about alphabetical ordering of matt and marc" | ||
# print(fuzz.ratio(marc, chat)) | ||
def magic(transcript, chat): | ||
""" matches questions from live chats with answers from lecture video""" | ||
qna = {} | ||
|
||
chat2 = "all of the students will do final exam at the same time?" | ||
for i in range(len(transcript) - 20): | ||
|
||
marc = "is it just returning one zero or negative one or does it return the magnitude of the difference" | ||
chat = "Is it just returning 1/0/-1 or does it return the magnitude of the diffference?" | ||
# print(fuzz.ratio(marc, chat)) | ||
speech, answer = is_responding_to_chat(transcript, i) | ||
if speech == None: | ||
continue | ||
|
||
for comment in chat: | ||
|
||
potential_question = comment['text'] | ||
answer_time = comment['time'] | ||
|
||
# if the lecturer repeats or paraphrases the question | ||
if (fuzz.ratio(speech, potential_question) > 50): | ||
if potential_question not in qna: | ||
qna[potential_question] = { | ||
"question": potential_question, | ||
"answer": " ".join(answer), | ||
"time": answer_time, | ||
"moderator_response": False, | ||
} | ||
break | ||
|
||
qna = list(qna.values()) | ||
return qna |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters