-
Notifications
You must be signed in to change notification settings - Fork 24
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
13 changed files
with
266 additions
and
51 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,38 +1,133 @@ | ||
defmodule CF.Videos.CaptionsFetcherYoutube do
  @moduledoc """
  A captions fetcher for YouTube.
  Based upon https://github.com/Valian/youtube-captions, but adapted with Httpoison.

  The fetcher downloads the public watch page of a video, extracts the
  `captionTracks` JSON embedded in that page, picks the track matching the
  requested language, then downloads and parses the XML transcript the track
  points to.
  """

  @behaviour CF.Videos.CaptionsFetcher

  require Logger

  @doc """
  Fetches the captions of a YouTube video for the given language.

  Expects a map (or struct) exposing `:youtube_id` and `:language`.

  Returns `{:ok, %{raw: xml, parsed: entries, format: "xml"}}` where `xml` is
  the transcript body as returned by YouTube and `entries` is a list of
  `%{start: float, duration: float, text: binary}` maps.

  Returns `{:error, message}` (a human-readable string) when the page or the
  transcript cannot be downloaded, when no caption tracks are found, or when no
  track matches the language.
  """
  @impl true
  def fetch(%{youtube_id: youtube_id, language: language}) do
    with {:ok, data} <- fetch_youtube_data(youtube_id),
         {:ok, caption_tracks} <- parse_caption_tracks(data),
         {:ok, transcript_url} <- find_transcript_url(caption_tracks, language),
         {:ok, transcript_data} <- fetch_transcript(transcript_url) do
      {:ok,
       %{
         raw: transcript_data,
         parsed: process_transcript(transcript_data),
         format: "xml"
       }}
    end
  end

  # Downloads the HTML of the video watch page. Only 2xx responses count as a
  # success so that error pages are reported instead of being parsed.
  defp fetch_youtube_data(video_id) do
    url = "https://www.youtube.com/watch?v=#{video_id}"

    case HTTPoison.get(url, []) do
      {:ok, %HTTPoison.Response{status_code: status, body: body}} when status in 200..299 ->
        {:ok, body}

      {:ok, %HTTPoison.Response{status_code: status}} ->
        {:error, "Failed to fetch YouTube video #{url}: HTTP status #{status}"}

      {:error, %HTTPoison.Error{reason: reason}} ->
        {:error, "Failed to fetch YouTube video #{url}: #{inspect(reason)}"}
    end
  end

  # Extracts and decodes the `captionTracks` JSON array embedded in the page.
  # The capture is lazy and stops at the first `]`, so a track list containing
  # nested arrays yields truncated JSON; that case is reported as an error
  # tuple instead of raising from inside the `with` chain of `fetch/1`.
  defp parse_caption_tracks(data) do
    captions_regex = ~r/"captionTracks":(?<data>\[.*?\])/

    case Regex.named_captures(captions_regex, data) do
      %{"data" => json} ->
        case Jason.decode(json) do
          {:ok, tracks} -> {:ok, tracks}
          {:error, _reason} -> {:error, "Could not parse captions for video"}
        end

      _ ->
        {:error, "Could not find captions for video"}
    end
  end

  # Picks the first track whose `vssId` contains ".<lang>" and returns its
  # `baseUrl`. The language is compared literally (no regex), so characters
  # such as "." or "+" in `lang` cannot alter the match.
  defp find_transcript_url(caption_tracks, lang) do
    # `to_string/1` turns a nil language into "", so the marker is just "."
    # and the first track carrying a dotted vssId is selected.
    marker = "." <> to_string(lang)

    case Enum.find(caption_tracks, &track_matches?(&1, marker)) do
      nil ->
        {:error, "Unable to find transcript for language #{lang}"}

      %{"baseUrl" => base_url} ->
        {:ok, base_url}

      _data ->
        {:error, "Unable to find transcript URL for language #{lang}"}
    end
  end

  # True when the track has a binary `vssId` containing the language marker.
  # Tracks without a usable `vssId` never match instead of raising.
  defp track_matches?(%{"vssId" => vss_id}, marker) when is_binary(vss_id) do
    String.contains?(vss_id, marker)
  end

  defp track_matches?(_track, _marker), do: false

  # Downloads the transcript XML. Non-2xx responses are reported as errors so
  # that an error page is never handed to `process_transcript/1`.
  defp fetch_transcript(base_url) do
    case HTTPoison.get(base_url, []) do
      {:ok, %HTTPoison.Response{status_code: status, body: body}} when status in 200..299 ->
        {:ok, body}

      {:ok, %HTTPoison.Response{status_code: status}} ->
        {:error, "Failed to fetch transcript: HTTP status #{status}"}

      {:error, %HTTPoison.Error{reason: reason}} ->
        {:error, "Failed to fetch transcript: #{inspect(reason)}"}
    end
  end

  # Strips the XML prolog and the <transcript> wrapper, then parses every
  # `<text ...>...</text>` element into an entry map.
  defp process_transcript(transcript) do
    transcript
    |> String.replace(~r/^<\?xml version="1.0" encoding="utf-8"\?><transcript>/, "")
    |> String.replace("</transcript>", "")
    |> String.split("</text>")
    |> Enum.reject(&(String.trim(&1) == ""))
    |> Enum.map(&process_line/1)
  end

  # Parses a single `<text start=".." dur="..">content` fragment.
  # Raises a MatchError when the `start` or `dur` attribute is missing.
  defp process_line(line) do
    %{"start" => start} = Regex.named_captures(~r/start="(?<start>[\d.]+)"/, line)
    %{"dur" => dur} = Regex.named_captures(~r/dur="(?<dur>[\d.]+)"/, line)

    text =
      line
      # Entities in the text are escaped twice; unescape the ampersand first so
      # that HtmlEntities.decode/1 below sees the real entity.
      |> String.replace("&amp;", "&")
      |> String.replace(~r/<text.+>/, "")
      |> String.replace(~r"</?[^>]+(>|$)", "")
      |> HtmlEntities.decode()
      |> String.trim()

    %{start: parse_float(start), duration: parse_float(dur), text: text}
  end

  # Converts a numeric string such as "12" or "12.5" to a float.
  # Raises a MatchError when `val` does not start with a number.
  defp parse_float(val) do
    {num, _} = Float.parse(val)
    num
  end

  # defp fetch_captions_content_with_official_api(video_id, locale) do
  #   # TODO: Continue dev here. See https://www.perplexity.ai/search/Can-you-show-jioyCtw.S4yrL8mlIBdqGg
  #   {:ok, token} = Goth.Token.for_scope("https://www.googleapis.com/auth/youtube.force-ssl")
  #   conn = YouTubeConnection.new(token.token)

  #   {:ok, captions} = GoogleApi.YouTube.V3.Api.Captions.youtube_captions_list(conn, ["snippet"], video_id, [])
  #   # {
  #   #   "kind": "youtube#captionListResponse",
  #   #   "etag": "kMTAKpyU_VGu7GxgEnxXHqcuEXM",
  #   #   "items": [
  #   #     {
  #   #       "kind": "youtube#caption",
  #   #       "etag": "tWo68CIcRRFZA0oXPt8HGxCYia4",
  #   #       "id": "AUieDaZJxYug0L5YNAw_31GbXz73b0CPXCDFlsPNSNe7KQvuv1g",
  #   #       "snippet": {
  #   #         "videoId": "v2IoEhuho2k",
  #   #         "lastUpdated": "2024-06-16T18:45:12.56697Z",
  #   #         "trackKind": "asr",
  #   #         "language": "fr",
  #   #         "name": "",
  #   #         "audioTrackType": "unknown",
  #   #         "isCC": false,
  #   #         "isLarge": false,
  #   #         "isEasyReader": false,
  #   #         "isDraft": false,
  #   #         "isAutoSynced": false,
  #   #         "status": "serving"
  #   #       }
  #   #     }
  #   #   ]
  #   # }

  #   caption_id = List.first(captions.items).id # TODO inspect to pick the right caption
  #   {:ok, caption} = GoogleApi.YouTube.V3.Api.Captions.youtube_captions_download(conn, caption_id, [])
  # end
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,66 @@ | ||
defmodule CF.Jobs.DownloadCaptions do
  @moduledoc """
  Job that downloads captions for videos that have none yet or whose most
  recent captions are older than 30 days.

  The work is done inside a named GenServer: `update/0` issues a synchronous
  call that processes one batch of at most 15 videos.
  """

  @behaviour CF.Jobs.Job

  require Logger
  import Ecto.Query
  import ScoutApm.Tracing

  alias DB.Repo
  alias DB.Schema.UserAction
  alias DB.Schema.Video
  alias DB.Schema.VideoCaption
  alias DB.Schema.UsersActionsReport

  alias CF.Jobs.ReportManager

  @name :download_captions
  @analyser_id UsersActionsReport.analyser_id(@name)

  # --- Client API ---

  def name, do: @name

  def start_link() do
    GenServer.start_link(__MODULE__, :ok, name: __MODULE__)
  end

  def init(args) do
    {:ok, args}
  end

  # 2 minutes
  @timeout 120_000

  # Runs one batch synchronously; the caller exits if the batch takes longer
  # than @timeout.
  def update() do
    GenServer.call(__MODULE__, :download_captions, @timeout)
  end

  # --- Server callbacks ---

  # NOTE(review): @transaction_opts is presumably meant for ScoutApm's
  # `deftransaction` macro; with a plain `def` it looks unused - confirm.
  @transaction_opts [type: "background", name: "download_captions"]
  def handle_call(:download_captions, _from, _state) do
    # Only the side effects matter here, so the results are not collected.
    Enum.each(get_videos(), fn video ->
      Logger.info("Downloading captions for video #{video.id}")
      CF.Videos.download_captions(video)
    end)

    {:reply, :ok, :ok}
  end

  # Get the videos that need new captions (15 most recent first). We fetch new
  # captions:
  # - For any video that doesn't have any captions yet
  # - For videos whose captions haven't been updated in the last 30 days
  #
  # The age check is done per video on the newest caption row (HAVING on the
  # grouped rows) so that a video owning both an old and a recent caption is
  # not selected again.
  defp get_videos() do
    Repo.all(
      from(v in Video,
        left_join: captions in VideoCaption,
        on: captions.video_id == v.id,
        group_by: v.id,
        having: count(captions.id) == 0 or max(captions.inserted_at) < ago(30, "day"),
        order_by: [desc: v.inserted_at],
        limit: 15
      )
    )
  end
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.