Commit

Fix: instagram

eight04 committed Mar 25, 2024
1 parent b4c4ccd commit 81b2861
Showing 7 changed files with 43 additions and 39 deletions.
2 changes: 1 addition & 1 deletion .gitignore
@@ -7,4 +7,4 @@ build
dist

.venv
test.*
test*.*
6 changes: 3 additions & 3 deletions README.rst
@@ -392,14 +392,14 @@ Changelog

- 2024.3.25

- Change: set referer and origin header in analyzer.
- Change: wait 3 seconds after analyze error.
- Fix: skip episodes without images in kemono.
- Add: .clip to valid file extensions.
- Add: ability to write partial data to disk.
- Add: browser and browser_profile settings which are used to extract cookies.
- Change: skip episodes without images in kemono.
- Add: after_request, session_key hooks.
- Add: session_manager for better control of api sessions.
- Change: set referer and origin header in analyzer.
- Change: wait 3 seconds after analyze error.

- 2024.1.4

2 changes: 1 addition & 1 deletion comiccrawler/config.py
@@ -51,7 +51,7 @@ def load(self):

self.config['DEFAULT']["savepath"] = normpath(self.config['DEFAULT']["savepath"])

if self.config["default"]["browser"]:
if self.config["DEFAULT"]["browser"]:
jar = yt_dlp.cookies.extract_cookies_from_browser(
self.config["DEFAULT"]["browser"], self.config["DEFAULT"]["browser_profile"])
session_manager.set_default_cookie(jar)
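The fix above matters because configparser reserves the uppercase name DEFAULT for its defaults section; indexing the parser with a lowercase "default" looks up a literal [default] section and raises KeyError when none exists. A minimal, self-contained illustration (the option names mirror this diff, the values are invented):

    from configparser import ConfigParser

    config = ConfigParser()
    config.read_string("""
    [DEFAULT]
    savepath = download
    browser = firefox
    browser_profile =
    """)

    print(config["DEFAULT"]["browser"])  # -> firefox
    # config["default"]["browser"] raises KeyError: 'default', because only the
    # reserved DEFAULT section is special; any other section name is matched
    # case-sensitively and no [default] section was defined.

With a non-empty browser value, load() then passes it together with browser_profile to yt_dlp.cookies.extract_cookies_from_browser and installs the resulting cookie jar as the session default, as shown in the hunk above.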
5 changes: 4 additions & 1 deletion comiccrawler/grabber.py
@@ -229,7 +229,10 @@ def iter_content(r):
yield from r.raw.read_chunked(decode_content=True)
else:
while not is_fp_closed(r.raw._fp) or len(r.raw._decoded_buffer) > 0: # pylint: disable=protected-access
yield r.raw.read1(decode_content=True)
b = r.raw.read1(decode_content=True)
yield b
if not b:
sleep(0.1)

def grabimg(*args, on_opened=None, tempfile=None, header=None, **kwargs):
"""Grab the image. Return ImgResult"""
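The reworked loop above now yields whatever read1() returns, including empty chunks, and sleeps briefly when nothing new has arrived, so a caller can flush partial data to disk while the download is still in progress instead of busy-waiting. A rough sketch of a consumer written against that contract (save_partial and the .part naming are illustrative assumptions, not the project's actual grabimg code):

    from pathlib import Path

    from comiccrawler.grabber import iter_content

    def save_partial(r, path):
        """Stream a response to disk chunk by chunk."""
        tmp = Path(str(path) + ".part")
        with tmp.open("wb") as f:
            for chunk in iter_content(r):
                if not chunk:
                    continue  # nothing new yet; the generator already slept 0.1s
                f.write(chunk)
                f.flush()  # partial data survives an interrupted download
        tmp.rename(path)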
10 changes: 1 addition & 9 deletions comiccrawler/mods/fantia.py
@@ -6,8 +6,8 @@
from ..episode import Episode
from ..grabber import grabber, grabhtml
from ..url import urljoin
from ..util import clean_tags, extract_curl
from ..session_manager import session_manager
from ..util import clean_tags

domain = ["fantia.jp"]
name = "fantia"
@@ -27,14 +27,6 @@ def get_title(html, url):
name = re.search('<h1 class="fanclub-name">(.+?)</h1', html).group(1)
return f"[fantia] {clean_tags(name)}"

def curl_to_kwargs(curl):
kwargs = {}
_url, header, cookie = extract_curl(curl)
# NOTE: method-level header/cookies won't be stored into session
kwargs["headers"] = header
kwargs["cookies"] = cookie
return kwargs

def get_episodes(html, url):
result = []
for match in re.finditer(r'<a[^>]+href="(/posts/(\d+))"[^>]+title="([^"]+)', html):
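The removed curl_to_kwargs helper used to unpack the user-supplied cURL command into per-request headers and cookies; with the session_manager import added above, that translation presumably happens once at the session level instead of on every call. A hedged sketch of the general pattern, reusing extract_curl's (url, headers, cookies) return from the removed code (apply_curl_to_session is a hypothetical name, not the project's real API):

    import requests

    from comiccrawler.util import extract_curl

    def apply_curl_to_session(session: requests.Session, curl: str) -> None:
        """Apply headers/cookies parsed from a copied cURL command session-wide."""
        _url, headers, cookies = extract_curl(curl)
        session.headers.update(headers)
        session.cookies.update(cookies)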
55 changes: 32 additions & 23 deletions comiccrawler/mods/instagram.py
@@ -5,39 +5,44 @@
import re
import json
from html import unescape
from urllib.parse import urlparse

from ..core import Episode
from ..error import is_http, SkipEpisodeError, SkipPageError
from ..url import update_qs
from ..grabber import grabber
from ..util import extract_curl
from ..session_manager import session_manager

domain = ["www.instagram.com"]
name = "Instagram"
noepfolder = True

cache_next_page = {}
config = {
"curl": "",
"api_curl": ""
"curl": r"""curl 'https://www.instagram.com/p/foo/' --compressed -H 'User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:126.0) Gecko/20100101 Firefox/126.0' -H 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8' -H 'Accept-Language: zh-TW,en-US;q=0.7,en;q=0.3' -H 'Accept-Encoding: gzip, deflate, br' -H 'Connection: keep-alive' -H 'Upgrade-Insecure-Requests: 1' -H 'Sec-Fetch-Dest: document' -H 'Sec-Fetch-Mode: navigate' -H 'Sec-Fetch-Site: none' -H 'Sec-Fetch-User: ?1' -H 'Priority: u=4' -H 'Pragma: no-cache' -H 'Cache-Control: no-cache'""",
"curl_api": ""
}
autocurl = True

def session_key(url):
r = urlparse(url)
if "api/v1" in r.path:
return (r.scheme, r.netloc, "api")

def init_api_session(html):
session = session_manager.get("https://www.instagram.com/api/v1/feed")
if "X-CSRFToken" in session.headers:
return
token = session.cookies.get("csrftoken", domain=".instagram.com") or session.cookies.get("csrftoken")
app_id = re.search(r'"APP_ID":"([^"]+)', html).group(1)
session.headers.update({
"X-CSRFToken": token,
"X-IG-App-ID": app_id,
})

def get_title(html, url):
title = re.search("<title>([^<]+)", html).group(1)
return "[instagram] {}".format(unescape(title).strip())

def grabhandler(grab_method, url, **kwargs):
if "api/v1" in url:
return grab_json(url, **kwargs)

def grab_json(url, **kwargs):
_url, header, cookie = extract_curl(config["api_curl"])
# NOTE: method-level header/cookies won't be stored into session
kwargs["headers"] = header
kwargs["cookies"] = cookie
return grabber(url, **kwargs).json()

def get_episodes_from_data(data):
user = data["user"]
timeline = user["edge_owner_to_timeline_media"]
@@ -56,12 +61,13 @@ def get_episodes(html, url):
if match := re.match(r"https://www\.instagram\.com/([^/]+)/", url):
username = match.group(1)
if username != "api":
init_api_session(html)
next_url = f"https://www.instagram.com/api/v1/feed/user/{username}/username/?count=12"
cache_next_page[url] = next_url
raise SkipPageError

if "api/v1/feed" in url:
body = html
body = json.loads(html)
result = []
for item in body["items"]:
result.append(Episode(
@@ -78,12 +84,12 @@
raise ValueError("unknown URL: {}".format(url))

def get_init_data(html, page):
shared_data = re.search("window\._sharedData = ([\s\S]+?);</script", html).group(1)
shared_data = re.search(r"window\._sharedData = ([\s\S]+?);</script", html).group(1)
shared_data = json.loads(shared_data)
return shared_data["entry_data"][page][0]["graphql"]

def get_extra_data(html):
text = re.search("window\.__additionalDataLoaded\('[^']+',(.*?)\);</script>", html).group(1)
text = re.search(r"window\.__additionalDataLoaded\('[^']+',(.*?)\);</script>", html).group(1)
return json.loads(text)

def find_media(media):
@@ -92,7 +98,7 @@ def find_media(media):
return max(media["image_versions2"]["candidates"], key=lambda i: i["height"])["url"]

def extract_json(html, filter=None):
for match in re.finditer(r'<script type="application/json"[^>]+>([^<]+)</script>', html):
for match in re.finditer(r'<script type="application/json"[^>]+>(.*?)</script>', html):
text = match.group(1)
if filter and not filter(text):
continue
@@ -112,19 +118,22 @@ def extract_json_value(data, key):
yield from extract_json_value(item, key)

def get_images(html, url):
pid = re.search(r"https://www\.instagram\.com/p/([^/]+)/", url).group(1)
result = []
key = "xdt_api__v1__media__shortcode__web_info"
# breakpoint()
from pathlib import Path
Path("test.html").write_text(html)
for data in extract_json(html, filter=lambda s: key in s):
for web_info in extract_json_value(data, key):
for item in web_info["items"]:
if item.get("carousel_media", None):
result += [find_media(m) for m in item["carousel_media"]]
else:
result.append(find_media(item))
# FIXME: need a way to detect 404 page
if not result:
print("no image found")
raise SkipEpisodeError(always=True)
# FIXME: there is no way to distinguish between request error and 404
if "PolarisErrorRoot" in html:
raise SkipEpisodeError(always=False)
return result

def get_next_page(html, url):
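Taken together, the new session_key hook and init_api_session route every api/v1 request onto a dedicated session, so the X-CSRFToken and X-IG-App-ID headers only need to be set once and then ride along on all later API calls, while regular page requests keep their own session. A toy model of keyed sessions to illustrate the idea — not the project's actual session_manager implementation:

    import requests
    from urllib.parse import urlparse

    def session_key(url):
        # same hook as in the diff above: all api/v1 URLs share one key
        r = urlparse(url)
        if "api/v1" in r.path:
            return (r.scheme, r.netloc, "api")

    class KeyedSessions:
        """One requests.Session per key; non-API URLs fall back to the host."""
        def __init__(self, key_fn):
            self.key_fn = key_fn
            self.sessions = {}

        def get(self, url):
            r = urlparse(url)
            key = self.key_fn(url) or (r.scheme, r.netloc)
            if key not in self.sessions:
                self.sessions[key] = requests.Session()
            return self.sessions[key]

    manager = KeyedSessions(session_key)
    api = manager.get("https://www.instagram.com/api/v1/feed/user/someuser/username/")
    api.headers["X-CSRFToken"] = "token-from-cookie"  # set once, as init_api_session does
    assert api is manager.get("https://www.instagram.com/api/v1/media/1/info/")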
2 changes: 1 addition & 1 deletion comiccrawler/session_manager.py
@@ -7,7 +7,7 @@
from .util import extract_curl

default_header = {
"User-Agent": "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:126.0) Gecko/20100101 Firefox/126.0",
"Accept-Language": "zh-tw,zh;q=0.8,en-us;q=0.5,en;q=0.3",
"Accept-Encoding": "gzip, deflate"
}
