Skip to content

Commit

Permalink
Fix all Youtube Feeds (ytsubs, ythistory, ytrec)
Browse files Browse the repository at this point in the history
  • Loading branch information
pukkandan committed Nov 21, 2020
1 parent 73b2906 commit de6b9b9
Showing 1 changed file with 121 additions and 78 deletions.
199 changes: 121 additions & 78 deletions youtube_dlc/extractor/youtube.py
Original file line number Diff line number Diff line change
Expand Up @@ -2839,6 +2839,26 @@ def _playlist_entries(self, video_list_renderer):
continue
yield self._extract_video(renderer)

def _itemSection_entries(self, item_sect_renderer):
    """Yield one video entry per ``videoRenderer`` found in an item section.

    item_sect_renderer is an ``itemSectionRenderer`` dict. Every element of
    its ``contents`` list that is a dict carrying a ``videoRenderer`` dict
    with a non-empty ``videoId`` is handed to ``self._extract_video`` and
    the result is yielded. Any other element is skipped.

    A section without a ``contents`` list (or an argument that is not a
    dict at all) yields nothing instead of raising, since callers only
    check that the renderer is truthy before passing it in.
    """
    if not isinstance(item_sect_renderer, dict):
        return
    contents = item_sect_renderer.get('contents')
    if not isinstance(contents, list):
        return
    for content in contents:
        if not isinstance(content, dict):
            continue
        renderer = content.get('videoRenderer')
        if not isinstance(renderer, dict):
            continue
        # Renderers without an id cannot be turned into an entry
        if not renderer.get('videoId'):
            continue
        yield self._extract_video(renderer)

def _rich_entries(self, rich_grid_renderer):
    """Yield the video entry wrapped by a single rich-grid item, if any.

    rich_grid_renderer is the item renderer dict; the video description is
    looked up at ``rich_grid_renderer['content']['videoRenderer']``. When
    that is a dict with a non-empty ``videoId`` it is passed to
    ``self._extract_video`` and the result is yielded.

    Items whose content is not a video (no ``videoRenderer`` dict) yield
    nothing. Previously such items made the lookup return None and the
    following ``.get`` call raised AttributeError.
    """
    if not isinstance(rich_grid_renderer, dict):
        return
    content = rich_grid_renderer.get('content')
    if not isinstance(content, dict):
        return
    renderer = content.get('videoRenderer')
    if not isinstance(renderer, dict):
        return
    if not renderer.get('videoId'):
        return
    yield self._extract_video(renderer)

def _video_entry(self, video_renderer):
video_id = video_renderer.get('videoId')
if video_id:
Expand Down Expand Up @@ -2930,49 +2950,67 @@ def _extract_continuation(cls, renderer):
}

def _entries(self, tab, identity_token):
continuation = None
slr_contents = try_get(tab, lambda x: x['sectionListRenderer']['contents'], list) or []
for slr_content in slr_contents:
if not isinstance(slr_content, dict):
continue
is_renderer = try_get(slr_content, lambda x: x['itemSectionRenderer'], dict)
if not is_renderer:
continue
isr_contents = try_get(is_renderer, lambda x: x['contents'], list) or []
for isr_content in isr_contents:
if not isinstance(isr_content, dict):
continue
renderer = isr_content.get('playlistVideoListRenderer')
if renderer:
for entry in self._playlist_entries(renderer):
yield entry
continuation = self._extract_continuation(renderer)
continue
renderer = isr_content.get('gridRenderer')
if renderer:
for entry in self._grid_entries(renderer):
yield entry
continuation = self._extract_continuation(renderer)
continue
renderer = isr_content.get('shelfRenderer')
if renderer:
for entry in self._shelf_entries(renderer):
yield entry

def extract_entries(parent_renderer):
slr_contents = try_get(parent_renderer, lambda x: x['contents'], list) or []
for slr_content in slr_contents:
if not isinstance(slr_content, dict):
continue
renderer = isr_content.get('backstagePostThreadRenderer')
if renderer:
for entry in self._post_thread_entries(renderer):
yield entry
continuation = self._extract_continuation(renderer)
is_renderer = try_get(slr_content, lambda x: x['itemSectionRenderer'], dict)
if not is_renderer:
renderer = slr_content.get('richItemRenderer')
if renderer:
for entry in self._rich_entries(renderer):
yield entry
continuation_list[0] = self._extract_continuation(parent_renderer)
continue
renderer = isr_content.get('videoRenderer')
if renderer:
entry = self._video_entry(renderer)
if entry:
yield entry
isr_contents = try_get(is_renderer, lambda x: x['contents'], list) or []
for isr_content in isr_contents:
if not isinstance(isr_content, dict):
continue
renderer = isr_content.get('playlistVideoListRenderer')
if renderer:
for entry in self._playlist_entries(renderer):
yield entry
continuation_list[0] = self._extract_continuation(renderer)
continue
renderer = isr_content.get('gridRenderer')
if renderer:
for entry in self._grid_entries(renderer):
yield entry
continuation_list[0] = self._extract_continuation(renderer)
continue
renderer = isr_content.get('shelfRenderer')
if renderer:
for entry in self._shelf_entries(renderer):
yield entry
continuation_list[0] = self._extract_continuation(parent_renderer)
continue
renderer = isr_content.get('backstagePostThreadRenderer')
if renderer:
for entry in self._post_thread_entries(renderer):
yield entry
continuation_list[0] = self._extract_continuation(renderer)
continue
renderer = isr_content.get('videoRenderer')
if renderer:
entry = self._video_entry(renderer)
if entry:
yield entry
if not continuation_list[0]:
continuation_list[0] = self._extract_continuation(is_renderer)
if not continuation_list[0]:
continuation_list[0] = self._extract_continuation(parent_renderer)

continuation_list = [None] # Python 2 doesnot support nonlocal
parent_renderer = (
try_get(tab, lambda x: x['sectionListRenderer'], dict) or
try_get(tab, lambda x: x['richGridRenderer'], dict) or {})
if parent_renderer:
for entry in extract_entries(parent_renderer):
yield entry

if not continuation:
continuation = self._extract_continuation(is_renderer)
continuation = continuation_list[0]

headers = {
'x-youtube-client-name': '1',
Expand All @@ -2984,6 +3022,8 @@ def _entries(self, tab, identity_token):
for page_num in itertools.count(1):
if not continuation:
break
if hasattr(self,'_MAX_PAGES') and page_num > self._MAX_PAGES:
break
browse = self._download_json(
'https://www.youtube.com/browse_ajax', None,
'Downloading page %d' % page_num,
Expand Down Expand Up @@ -3015,6 +3055,13 @@ def _entries(self, tab, identity_token):
yield entry
continuation = self._extract_continuation(continuation_renderer)
continue
continuation_renderer = continuation_contents.get('sectionListContinuation')
if continuation_renderer:
continuation_list = [None]
for entry in extract_entries(continuation_renderer):
yield entry
continuation = continuation_list[0]
continue

continuation_items = try_get(
response, lambda x: x['onResponseReceivedActions'][0]['appendContinuationItemsAction']['continuationItems'], list)
Expand All @@ -3029,7 +3076,12 @@ def _entries(self, tab, identity_token):
yield entry
continuation = self._extract_continuation(video_list_renderer)
continue

renderer = continuation_item.get('itemSectionRenderer')
if renderer:
for entry in self._itemSection_entries(renderer):
yield entry
continuation = self._extract_continuation({'contents': continuation_items})
continue
break

@staticmethod
Expand Down Expand Up @@ -3422,12 +3474,13 @@ def _real_extract(self, url):
return IE._get_n_results(query, self._MAX_RESULTS)


class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
class YoutubeFeedsInfoExtractor(YoutubeTabIE):
"""
Base class for feed extractors
Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
"""
_LOGIN_REQUIRED = True
_MAX_PAGES = 5

@property
def IE_NAME(self):
Expand All @@ -3436,44 +3489,34 @@ def IE_NAME(self):
def _real_initialize(self):
self._login()

def _entries(self, page):
# The extraction process is the same as for playlists, but the regex
# for the video ids doesn't contain an index
ids = []
more_widget_html = content_html = page
for page_num in itertools.count(1):
matches = re.findall(r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html)

# 'recommended' feed has infinite 'load more' and each new portion spins
# the same videos in (sometimes) slightly different order, so we'll check
# for unicity and break when portion has no new videos
new_ids = list(filter(lambda video_id: video_id not in ids, orderedSet(matches)))
if not new_ids:
break

ids.extend(new_ids)

for entry in self._ids_to_results(new_ids):
yield entry

mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
if not mobj:
break
def _shelf_entries(self, shelf_renderer):
    """Yield the entries of the grid nested inside a shelf renderer.

    The grid is fetched with ``try_get`` from
    ``shelf_renderer['content']['gridRenderer']`` (requesting a dict).
    When that lookup gives a truthy result, everything produced by
    ``self._grid_entries`` for it is yielded; otherwise nothing is.
    """
    grid = try_get(
        shelf_renderer, lambda shelf: shelf['content']['gridRenderer'], dict)
    if grid:
        for video in self._grid_entries(grid):
            yield video

more = self._download_json(
'https://www.youtube.com/%s' % mobj.group('more'), self._PLAYLIST_TITLE,
'Downloading page #%s' % page_num,
transform_source=uppercase_escape,
headers=self._YOUTUBE_CLIENT_HEADERS)
content_html = more['content_html']
more_widget_html = more['load_more_widget_html']
def _extract_from_tabs(self, item_id, webpage, data, tabs, identity_token):
    """Build the feed playlist result from the selected tab.

    The tab returned by ``self._extract_selected_tab(tabs)`` supplies the
    ``content`` that is fed, together with identity_token, to
    ``self._entries``. The resulting entries are wrapped by
    ``self.playlist_result`` under the title ``self._PLAYLIST_TITLE``.

    item_id, webpage and data are accepted but not used here.
    """
    tab_content = self._extract_selected_tab(tabs)['content']
    entries = self._entries(tab_content, identity_token)
    return self.playlist_result(
        entries, playlist_title=self._PLAYLIST_TITLE)

def _real_extract(self, url):
page = self._download_webpage(
'https://www.youtube.com/feed/%s' % self._FEED_NAME,
self._PLAYLIST_TITLE)
return self.playlist_result(
self._entries(page), playlist_title=self._PLAYLIST_TITLE)
item_id = self._FEED_NAME
url = 'https://www.youtube.com/feed/%s' % self._FEED_NAME
webpage = self._download_webpage(url, item_id)
identity_token = self._search_regex(
r'\bID_TOKEN["\']\s*:\s*["\'](.+?)["\']', webpage,
'identity token', default=None)
data = self._extract_yt_initial_data(item_id, webpage)
tabs = try_get(
data, lambda x: x['contents']['twoColumnBrowseResultsRenderer']['tabs'], list)
if tabs:
return self._extract_from_tabs(item_id, webpage, data, tabs, identity_token)
# Failed to recognize
raise ExtractorError('Unable to recognize feed page')


class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
Expand Down Expand Up @@ -3509,7 +3552,7 @@ def _real_extract(self, url):

class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
_VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/recommended|:ytrec(?:ommended)?'
_VALID_URL = r'https?://(?:www\.)?youtube\.com(?:/feed/recommended|/?[?#]|/?$)|:ytrec(?:ommended)?'
_FEED_NAME = 'recommended'
_PLAYLIST_TITLE = 'Youtube Recommended videos'

Expand Down

0 comments on commit de6b9b9

Please sign in to comment.