diff --git a/yt_dlp/extractor/docubay.py b/yt_dlp/extractor/docubay.py index a829f909652d..6314c232119d 100755 --- a/yt_dlp/extractor/docubay.py +++ b/yt_dlp/extractor/docubay.py @@ -1,3 +1,6 @@ +import re + +from ..utils import try_get from .common import InfoExtractor @@ -5,26 +8,60 @@ class DocubayIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?docubay\.com/[^#/?$]+-(?P<id>\d+)' _TESTS = [{ - 'url': 'https://www.docubay.com/20000-cables-under-the-sea-2574', + 'url': 'https://www.docubay.com/inside-ikea-3242', 'info_dict': { - 'id': '2574', + 'id': '3242', 'ext': 'mp4', - 'description': 'md5:aed927dcff70441282d7864e9b9a8d20', - 'thumbnail': 'https://st1.docubay.com/featured-images/1606222830-20k-cables-under-the-sea-1024x576-xoriginal.jpg', - 'title': '20,000 Cables under the Sea', + 'description': 'md5:89b599ebdd695811d4f76a2fc5ee5718', + 'thumbnail': 'https://st1.docubay.com/featured-images/1617969319-inside-ikea-1024x576-banner-xoriginal.jpg', + 'title': 'Inside IKEA', + 'upload_date': '20180101', + } + }, { + 'url': 'https://www.docubay.com/the-female-battalion-3430', + 'playlist_mincount': 6, + 'info_dict': { + 'id': '3430', + 'title': 'The Female Battalion', } + }] _API_URL = "https://www.docubay.com/ajaxplayer" + def handle_series(self, webpage, s_id): + id = self._search_regex(r'data-currcontentid\s*=\s*"\d+"\s*data-id\s*=\s*"(\d+)"', webpage, 'id', None, False) + current_id = self._search_regex(r'data-currcontentid\s*=\s*"(\d+)"', webpage, 'current_id', None, False) + series_dump = self._download_webpage( + "https://www.docubay.com/season-api", + video_id=s_id, + headers={'content-type': 'application/x-www-form-urlencoded; charset=UTF-8'}, + data=f'id={id}&current_id={current_id}&curr_play_status='.encode() + ) + return self.playlist_result([ + self.url_result(v_url, ie=DocubayIE.ie_key()) + for v_url in set(re.findall(r'(https?://(?:www\.)?docubay\.com/[^#/?$]+-\d+)', series_dump)) or [] + ], + playlist_id=s_id, 
playlist_title=self._search_regex(r'class\s*=\s*"shows-title"[^>]*>([^<]+)', webpage, 'title', None, False)) + def _real_extract(self, url): video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + title = self._og_search_title(webpage).replace(' | Online at DocuBay', '').replace('Watch ', '').replace(' - ', '') + if not title: + title = self._search_regex(r'class\s*=\s*"shows-title"[^>]*>([^<]+)', webpage, 'title', None, False) + data_json = self._parse_json(self._download_json( self._API_URL, video_id, headers={'content-type': 'application/x-www-form-urlencoded; charset=UTF-8'}, data=f'cid={video_id}&action=st&type=video'.encode()), video_id) + if not try_get(data_json, lambda x: x['url']['video_url'], None): + return self.handle_series(webpage, video_id) + formats, subtitles = self._extract_m3u8_formats_and_subtitles(data_json['url']['video_url'], video_id) subs = {} for sub in data_json.get('subtitles') or []: @@ -35,11 +72,6 @@ def _real_extract(self, url): 'url': sub_url, }) - webpage = self._download_webpage(url, video_id) - title = self._search_regex(r'class\s*=\s*"shows-title"[^>]*>([^<]+)', webpage, 'title', None, False) - if not title: - title = self._og_search_title(webpage).replace(' | Online at DocuBay', '').replace('Watch ', '') - upload_date = self._search_regex(r'class\s*=\s*"show-duration"[^>]*>\s*(\d+)', webpage, 'upload date', None, False) if upload_date: upload_date += '0101' @@ -54,5 +86,4 @@ def _real_extract(self, url): 'upload_date': upload_date, 'formats': formats, 'subtitles': subtitles - }