diff --git a/.dockerignore b/.dockerignore
new file mode 100644
index 0000000..d70c8f9
--- /dev/null
+++ b/.dockerignore
@@ -0,0 +1,10 @@
+.git
+.gitignore
+__pycache__/
+*.py[cod]
+*.pyo
+.Python
+.env
+cookie.json
+download_comments/
+data/
diff --git a/.gitignore b/.gitignore
index 0bb0c76..03d2ff0 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,5 @@
 __pycache__
 cookie.json
 .env
+data/
+download_comments/
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..9407a3b
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,22 @@
+FROM python:3.12-slim AS runtime
+
+ENV PYTHONDONTWRITEBYTECODE=1 \
+    PYTHONUNBUFFERED=1 \
+    PIP_NO_CACHE_DIR=1
+
+WORKDIR /app
+
+RUN groupadd --system app && useradd --system --gid app --home-dir /data app
+
+COPY requirements.txt .
+RUN pip install --no-cache-dir --upgrade pip \
+    && pip install --no-cache-dir -r requirements.txt
+
+COPY login.py main.py ./
+
+RUN mkdir -p /data && chown -R app:app /app /data
+
+USER app
+WORKDIR /data
+
+CMD ["python", "/app/main.py"]
diff --git a/README.md b/README.md
index b267f18..3ed2958 100644
--- a/README.md
+++ b/README.md
@@ -31,6 +31,7 @@ Scrape **all parent comments** from any Instagram Reel with **automated login**,
 ## 📦 Requirements
 
 - Python **3.9+**
+- Docker and Docker Compose (optional, only needed for the container workflow)
 - Dependencies:
 
 ```bash
@@ -50,7 +51,32 @@ python3 main.py
 ```
  * Enter the Instagram Reel URL (e.g., https://www.instagram.com/reel/SHORTCODE/).
  * Set Max requests per second (5-7 recommended). Adjust for stability.
- * On first run, provide username/password; cookie.json is created and reused until expiry.
+ * On first run, provide username/password and a 2FA code if prompted; `cookie.json` is created and reused until expiry.
+
+## 🐳 Docker Usage
+
+Run the scraper without installing Python dependencies on your host:
+
+```bash
+docker compose run --rm instascrape
+```
+
+The container runs as a non-root user and stores runtime files in `./data`, which must be writable by that user:
+
+ * `./data/cookie.json`
+ * `./data/download_comments/txt/...`
+ * `./data/download_comments/json/...`
+
+Rebuild the image after changing dependencies or the source files (`login.py` and `main.py` are copied into the image):
+
+```bash
+docker compose build
+```
+
+If automated login cannot complete, the CLI can import an authenticated
+Instagram `Cookie` header from a logged-in browser request. The header must
+include `sessionid`, `csrftoken`, `mid`, and `ds_user_id`; it is then stored in
+`./data/cookie.json`.
 
 ## 📁 Output
  * TXT: download_comments/txt/reel_comments_YYYYMMDD_HHMMSS.txt
@@ -67,6 +93,7 @@ Example JSON structure:
 ```
 ---
 ## 🔧 How it Works
+ * Login: tries `instagrapi` first (current Instagram login flow with 2FA support), then the legacy signed login, and finally offers a cookie-header import when automated login fails.
  * Cookie Lifecycle: cookie.json stores iat and expiry; validated on startup & during requests.
  * Error Resilience: retries transient errors and refreshes cookies on 401/redirect-to-login.
  * Progress Accuracy: uses Instagram’s comment count to calculate percent & ETA.
@@ -75,7 +102,7 @@ Example JSON structure:
 ## 💡 Tips
  * Start with 5-7 RPS to minimize throttling; increase gradually.
  * Filenames use local time; switch to UTC by replacing datetime.now() with datetime.utcnow() in main.py.
+ * If login fails with `challenge_required`, open Instagram in the official app or browser, approve the login challenge, then retry.
 ---
 ## ⚠️ Disclaimer
 Use responsibly. Comply with Instagram’s Terms of Service. Intended for personal or permitted use only.
-
diff --git a/docker-compose.yml b/docker-compose.yml
new file mode 100644
index 0000000..737b0cd
--- /dev/null
+++ b/docker-compose.yml
@@ -0,0 +1,11 @@
+services:
+  instascrape:
+    build:
+      context: .
+    image: instascrape:local
+    stdin_open: true
+    tty: true
+    volumes:
+      - ./data:/data
+    environment:
+      PYTHONUNBUFFERED: "1"
diff --git a/login.py b/login.py
index f50ad81..ed78c31 100644
--- a/login.py
+++ b/login.py
@@ -2,6 +2,7 @@
 # -*- coding: utf-8 -*-
 
 import argparse
+import getpass
 import json
 import time
 import hmac
@@ -9,13 +10,15 @@
 import uuid
 import random
 import sys
-from typing import Optional, Dict, Any, Tuple
+from typing import Callable, Optional, Dict, Any, Tuple
 
 import requests
 
 API_URL = "https://i.instagram.com/api/{version}/"
 V = "v1"
-USER_AGENT = "Instagram 123.0.0.0 Android (30/11; 420dpi; 1080x1920; Google; Pixel; sailfish; qcom; en_US)"
+# Keep this close to a current official Android release. Instagram can block
+# login attempts that look like very old app builds, especially for EU users.
+USER_AGENT = "Instagram 430.0.0.19.80 Android (35/15; 420dpi; 1080x2400; Google/google; Pixel 8; shiba; shiba; en_US; 123456789)"
 IG_SIG_KEY = "0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef"
 SIG_KEY_VERSION = "4"
 IG_CAPABILITIES = "3brTvw"
@@ -134,17 +137,216 @@ def cookie_json_valid(d: Optional[Dict[str, Any]]) -> bool:
     required_ok = all(k in c and isinstance(c.get(k), str) and c.get(k) for k in ("sessionid", "csrftoken", "mid", "ds_user_id"))
     return bool(required_ok and isinstance(overall, int) and overall > now)
 
+def parse_cookie_string(cookie_header: str) -> Tuple[str, str, str, str]:
+    """
+    Parse a browser Cookie header into (sessionid, csrftoken, mid, ds_user_id).
+
+    Accepts the bare header value ("a=1; b=2") as well as the full header line
+    ("Cookie: a=1; b=2"). Raises LoginError naming every required cookie that
+    is missing or empty.
+    """
+    header = cookie_header.strip()
+    # Users often paste the whole header line; without this the first cookie
+    # would be stored under a key like "Cookie: mid" and reported as missing.
+    if header.lower().startswith("cookie:"):
+        header = header[len("cookie:"):]
+
+    parts: Dict[str, str] = {}
+    for item in header.split(";"):
+        if "=" not in item:
+            continue
+        key, value = item.split("=", 1)
+        parts[key.strip()] = value.strip()
+
+    missing = [k for k in ("sessionid", "csrftoken", "mid", "ds_user_id") if not parts.get(k)]
+    if missing:
+        raise LoginError(f"Cookie string is missing required values: {', '.join(missing)}")
+
+    return parts["sessionid"], parts["csrftoken"], parts["mid"], parts["ds_user_id"]
+
 # ---------- Login Core ----------
 
 class LoginError(Exception):
     pass
 
-def login_instagram(username: str, password: str, timeout_prelogin: int = 10, timeout_login: int = 20) -> Tuple[str, str, str, str]:
+def _cookie_tuple_from_mapping(cookies: Dict[str, Any]) -> Optional[Tuple[str, str, str, str]]:
+    """Return the four required cookies from a mapping, or None if any is missing or empty."""
+    values = {k: str(cookies.get(k) or "") for k in ("sessionid", "csrftoken", "mid", "ds_user_id")}
+    if all(values.values()):
+        return values["sessionid"], values["csrftoken"], values["mid"], values["ds_user_id"]
+    return None
+
+def _cookie_tuple_from_jar(jar: requests.cookies.RequestsCookieJar) -> Optional[Tuple[str, str, str, str]]:
+    """Return the four required cookies from a requests cookie jar, or None if any is missing."""
+    sessionid = get_cookie_value(jar, "sessionid", domain="instagram.com")
+    csrftoken = get_cookie_value(jar, "csrftoken", domain="instagram.com")
+    mid = get_cookie_value(jar, "mid", domain="instagram.com")
+    dsuserid = get_cookie_value(jar, "ds_user_id", domain="instagram.com")
+    if all([sessionid, csrftoken, mid, dsuserid]):
+        return sessionid, csrftoken, mid, dsuserid
+    return None
+
+def _extract_cookie_tuple(session: requests.Session) -> Tuple[str, str, str, str]:
+    """Return the required cookies from a logged-in session or raise LoginError."""
+    result = _cookie_tuple_from_jar(session.cookies)
+    if result:
+        return result
+
+    raise LoginError("Login succeeded but required cookies are missing.")
+
+def _extract_instagrapi_cookies(client: Any) -> Tuple[str, str, str, str]:
+    """Read the required cookies from an instagrapi client's sessions, then from its settings."""
+    for attr in ("private", "public"):
+        session = getattr(client, attr, None)
+        jar = getattr(session, "cookies", None)
+        if jar is not None:
+            result = _cookie_tuple_from_jar(jar)
+            if result:
+                return result
+
+    try:
+        settings = client.get_settings()
+    except Exception:
+        settings = {}
+
+    cookies = settings.get("cookies") if isinstance(settings, dict) else None
+    if isinstance(cookies, dict):
+        result = _cookie_tuple_from_mapping(cookies)
+        if result:
+            return result
+
+    raise LoginError("instagrapi login succeeded but required cookies are missing.")
+
+def login_instagram_with_instagrapi(
+    username: str,
+    password: str,
+    two_factor_code_provider: Optional[Callable[[Dict[str, Any]], str]] = None,
+) -> Tuple[str, str, str, str]:
+    """
+    Log in through instagrapi and return (sessionid, csrftoken, mid, ds_user_id).
+
+    two_factor_code_provider is called with an empty dict when Instagram asks
+    for a 2FA code. Every instagrapi failure, including a failed 2FA retry, is
+    raised as LoginError so callers only need to handle one exception type.
+    """
+    try:
+        from instagrapi import Client
+        from instagrapi.exceptions import ChallengeRequired, TwoFactorRequired
+    except Exception as exc:
+        raise LoginError(f"instagrapi is not available: {exc}") from exc
+
+    client = Client()
+
+    def _attempt(**extra: Any) -> bool:
+        # True on success, False when Instagram asks for a 2FA code; every
+        # other instagrapi failure is translated into LoginError.
+        try:
+            client.login(username, password, **extra)
+        except TwoFactorRequired:
+            return False
+        except ChallengeRequired as exc:
+            raise LoginError(f"Challenge required by Instagram; solve in-app and retry. Details: {exc}") from exc
+        except Exception as exc:
+            raise LoginError(f"instagrapi login failed: {exc}") from exc
+        return True
+
+    if not _attempt():
+        if two_factor_code_provider is None:
+            raise LoginError("Two-factor authentication required on this account; provide a 2FA code to continue.")
+        code = two_factor_code_provider({}).strip().replace(" ", "")
+        if not code:
+            raise LoginError("Two-factor authentication code was empty.")
+        # The retry goes through the same translation, so a failure here no
+        # longer escapes as a raw instagrapi exception.
+        if not _attempt(verification_code=code):
+            raise LoginError("instagrapi login failed: two-factor authentication was requested again after a code was supplied.")
+
+    return _extract_instagrapi_cookies(client)
+
+def _pick_2fa_method(two_factor_info: Dict[str, Any]) -> str:
+    """Return the verification_method value: "3" for TOTP (also the default), "1" for SMS."""
+    if two_factor_info.get("totp_two_factor_on"):
+        return "3"
+    if two_factor_info.get("sms_two_factor_on"):
+        return "1"
+    return "3"
+
+def _complete_two_factor_login(
+    session: requests.Session,
+    username: str,
+    two_factor_info: Dict[str, Any],
+    code_provider: Callable[[Dict[str, Any]], str],
+    timeout_login: int,
+) -> Tuple[str, str, str, str]:
+    """
+    Finish a legacy-API login that answered with two_factor_required.
+
+    Asks code_provider for the code, posts it to accounts/two_factor_login/
+    and returns (sessionid, csrftoken, mid, ds_user_id) from the session
+    cookies. Raises LoginError when the identifier or code is missing, on a
+    network or HTTP error, or when the response carries no logged-in user.
+    """
+    identifier = two_factor_info.get("two_factor_identifier")
+    if not identifier:
+        raise LoginError("Two-factor authentication required but Instagram did not return a two_factor_identifier.")
+
+    code = code_provider(two_factor_info).strip().replace(" ", "")
+    if not code:
+        raise LoginError("Two-factor authentication code was empty.")
+
+    # NOTE(review): login_instagram() calls generate_device_id() without an
+    # argument; confirm the helper accepts a seed and that the 2FA request may
+    # use a different device id than the password request.
+    device_id = generate_device_id(username)
+    params = {
+        "username": username,
+        "verification_code": code,
+        "two_factor_identifier": identifier,
+        "trust_this_device": "1",
+        "guid": generate_uuid(False),
+        "device_id": device_id,
+        "waterfall_id": generate_uuid(False),
+        "verification_method": _pick_2fa_method(two_factor_info),
+    }
+    signed = sign_params(IG_SIG_KEY, SIG_KEY_VERSION, params)
+    url = API_URL.format(version=V) + "accounts/two_factor_login/"
+
+    try:
+        r = session.post(url, data=signed, headers={"Content-type": "application/x-www-form-urlencoded; charset=UTF-8"}, timeout=timeout_login)
+    except requests.RequestException as e:
+        raise LoginError(f"Network error during two-factor login: {e}") from e
+
+    try:
+        j = r.json()
+    except Exception:
+        j = {"status": "unknown", "text": r.text}
+
+    if r.status_code != 200:
+        raise LoginError(f"HTTP {r.status_code} during two-factor login: {j}")
+
+    # logged_in_user can be present but null; "or {}" keeps that a LoginError.
+    if not isinstance(j, dict) or not (j.get("logged_in_user") or {}).get("pk"):
+        raise LoginError(f"Unable to complete two-factor login: {j}")
+
+    return _extract_cookie_tuple(session)
+
+def login_instagram(
+    username: str,
+    password: str,
+    timeout_prelogin: int = 10,
+    timeout_login: int = 20,
+    two_factor_code_provider: Optional[Callable[[Dict[str, Any]], str]] = None,
+) -> Tuple[str, str, str, str]:
     """
     Perform the mobile-app-like login and return tuple:
     (sessionid, csrftoken, mid, ds_user_id)
     Raises LoginError on failure with meaningful message.
     """
+    try:
+        return login_instagram_with_instagrapi(username, password, two_factor_code_provider)
+    except LoginError as instagrapi_error:
+        fallback_error = str(instagrapi_error)
+
     s = requests.Session()
     s.headers.update(default_headers(USER_AGENT, IG_CAPABILITIES, APPLICATION_ID))
 
@@ -159,7 +361,7 @@ def login_instagram(username: str, password: str, timeout_prelogin: int = 10, ti
 
     csrftoken = get_cookie_value(s.cookies, "csrftoken")
     if not csrftoken:
-        raise LoginError("Unable to get CSRF from prelogin.")
+        raise LoginError(f"{fallback_error}; legacy login fallback also failed: Unable to get CSRF from prelogin.")
 
     device_id = generate_device_id()
     login_params = {
@@ -194,7 +396,15 @@ def login_instagram(username: str, password: str, timeout_prelogin: int = 10, ti
         raise LoginError(f"Unexpected login response: {j}")
 
     if j.get("two_factor_required"):
-        raise LoginError("Two-factor authentication required on this account; interactive flow not implemented.")
+        if two_factor_code_provider is None:
+            raise LoginError("Two-factor authentication required on this account; provide a 2FA code to continue.")
+        return _complete_two_factor_login(
+            s,
+            username,
+            j.get("two_factor_info") or {},
+            two_factor_code_provider,
+            timeout_login,
+        )
 
     if j.get("challenge_required"):
         raise LoginError("Challenge required by Instagram; solve in-app and retry.")
@@ -202,28 +412,28 @@ def login_instagram(username: str, password: str, timeout_prelogin: int = 10, ti
     if not j.get("logged_in_user", {}).get("pk"):
         raise LoginError(f"Unable to login: {j}")
 
-    # Extract cookies
-    sessionid = get_cookie_value(s.cookies, "sessionid", domain="instagram.com")
-    csrftoken = get_cookie_value(s.cookies, "csrftoken", domain="instagram.com")
-    mid = get_cookie_value(s.cookies, "mid", domain="instagram.com")
-    dsuserid = get_cookie_value(s.cookies, "ds_user_id", domain="instagram.com")
-
-    if not all([sessionid, csrftoken, mid, dsuserid]):
-        raise LoginError("Login succeeded but required cookies are missing.")
-
-    return (sessionid, csrftoken, mid, dsuserid)
+    return _extract_cookie_tuple(s)
 
 # ---------- CLI ----------
 
 def main():
     ap = argparse.ArgumentParser(description="Instagram app login and dump cookies (standalone)")
     ap.add_argument("-u", "--username", required=True)
-    ap.add_argument("-p", "--password", required=True)
+    ap.add_argument("-p", "--password")
+    ap.add_argument("--cookie-header", help="Paste an authenticated Instagram Cookie header instead of logging in.")
     ap.add_argument("--json", action="store_true")
     args = ap.parse_args()
 
     try:
-        sessionid, csrftoken, mid, dsuserid = login_instagram(args.username, args.password)
+        if args.cookie_header:
+            sessionid, csrftoken, mid, dsuserid = parse_cookie_string(args.cookie_header)
+        else:
+            password = args.password or getpass.getpass("Instagram password: ")
+            sessionid, csrftoken, mid, dsuserid = login_instagram(
+                args.username,
+                password,
+                two_factor_code_provider=lambda info: getpass.getpass("Two-factor authentication code: "),
+            )
         if args.json:
             print(json.dumps(
                 {"sessionid": sessionid, "csrftoken": csrftoken, "mid": mid, "ds_user_id": dsuserid},
diff --git a/main.py b/main.py
index a5781e7..3eef518 100644
--- a/main.py
+++ b/main.py
@@ -1,6 +1,7 @@
 # -*- coding: utf-8 -*-
 
 import asyncio
+import getpass
 import httpx
 import json
 import os
@@ -17,6 +18,7 @@
     read_cookie_json,
     write_cookie_json,
     cookie_json_valid,
+    parse_cookie_string,
     LoginError,
 )
 
@@ -120,9 +122,7 @@ def write_outputs(base_name: str, comments_flat: List[str], comments_struct: Lis
 
 async def refresh_cookies_interactive(shortcode: str) -> Tuple[str, str, str, str, Dict[str, str]]:
     print("Detected expired/invalid cookies. Please relogin.")
-    username = input("Enter your username: ").strip()
-    password = input("Enter your instagram password: ").strip()
-    si, ct, m, du = login_instagram(username, password)
+    si, ct, m, du = login_or_cookie_interactive()
     write_cookie_json(si, ct, m, du)
     headers = build_headers(shortcode, cookies_string(si, ct, m, du))
     print("Refreshed cookies saved.")
@@ -235,16 +235,40 @@ def prompt_rps() -> float:
         except ValueError:
             print("Enter a numeric value like 1, 2.5, 5.")
 
+def prompt_two_factor_code() -> str:
+    """Read the two-factor authentication code from the terminal via getpass."""
+    return getpass.getpass("Enter your two-factor authentication code: ")
+
+def login_or_cookie_interactive() -> Tuple[str, str, str, str]:
+    """
+    Prompt for credentials and return (sessionid, csrftoken, mid, ds_user_id).
+
+    If the automated login raises LoginError, ask for a pasted browser Cookie
+    header instead and parse it with parse_cookie_string.
+    """
+    username = input("Enter your username: ").strip()
+    password = getpass.getpass("Enter your instagram password: ").strip()
+    try:
+        return login_instagram(
+            username,
+            password,
+            two_factor_code_provider=lambda info: prompt_two_factor_code(),
+        )
+    except LoginError as exc:
+        print(f"Automated login failed: {exc}")
+        print("Paste an authenticated Instagram Cookie header from your browser to continue.")
+        print("It must include: sessionid, csrftoken, mid, ds_user_id.")
+        cookie_header = getpass.getpass("Instagram Cookie header: ").strip()
+        return parse_cookie_string(cookie_header)
+
 def load_or_login_get_cookies_interactive() -> Tuple[str, str, str, str]:
     dj = read_cookie_json()
     if cookie_json_valid(dj):
         c = dj["cookies"]
         return c["sessionid"], c["csrftoken"], c["mid"], c["ds_user_id"]
     print("Saved login expired or missing. Please login again.")
-    username = input("Enter your username: ").strip()
-    password = input("Enter your instagram password: ").strip()
     print("Logging in to your account to fetch cookies...")
-    sessionid, csrftoken, mid, dsuserid = login_instagram(username, password)
+    sessionid, csrftoken, mid, dsuserid = login_or_cookie_interactive()
     print("Cookies fetched successfully.")
     write_cookie_json(sessionid, csrftoken, mid, dsuserid)
     return sessionid, csrftoken, mid, dsuserid
diff --git a/requirements.txt b/requirements.txt
index 98587f6..16c3f3c 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,4 @@
-tqdm
-httpx
-requests
-
+tqdm>=4.66,<5
+httpx[http2]>=0.27,<1
+requests>=2.31,<3
+instagrapi>=2.1,<3