Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions .dockerignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
.git
.gitignore
__pycache__/
*.py[cod]
*.pyo
.Python
.env
cookie.json
download_comments/
data/
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
__pycache__
cookie.json
.env
data/
download_comments/
22 changes: 22 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# Runtime image for the scraper, built on the slim Python 3.12 base.
FROM python:3.12-slim AS runtime

# Do not write .pyc files, keep stdout/stderr unbuffered, and disable pip's cache.
ENV PYTHONDONTWRITEBYTECODE=1 \
PYTHONUNBUFFERED=1 \
PIP_NO_CACHE_DIR=1

# Application code is installed under /app.
WORKDIR /app

# Dedicated unprivileged system user and group; the user's home directory is /data.
RUN groupadd --system app && useradd --system --gid app --home-dir /data app

# Install dependencies before copying the sources so this layer can be reused
# from the build cache when only the Python files change.
COPY requirements.txt .
RUN pip install --no-cache-dir --upgrade pip \
&& pip install --no-cache-dir -r requirements.txt

COPY login.py main.py ./

# Create the runtime data directory and give the app user ownership of both trees.
RUN mkdir -p /data && chown -R app:app /app /data

# Drop root privileges and start in /data, so files the scraper writes with
# relative paths land in the data directory rather than next to the code.
USER app
WORKDIR /data

CMD ["python", "/app/main.py"]
31 changes: 29 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ Scrape **all parent comments** from any Instagram Reel with **automated login**,
## 📦 Requirements

- Python **3.9+**
- Docker and Docker Compose (optional, only needed for the containerized workflow)
- Dependencies:

```bash
Expand All @@ -50,7 +51,32 @@ python3 main.py
```
* Enter the Instagram Reel URL (e.g., https://www.instagram.com/reel/SHORTCODE/).
* Set Max requests per second (5-7 recommended). Adjust for stability.
* On first run, provide username/password; cookie.json is created and reused until expiry.
* On first run, provide username/password and a 2FA code if prompted; `cookie.json` is created and reused until expiry.

## 🐳 Docker Usage

Run the scraper without installing Python dependencies on your host:

```bash
docker compose run --rm instascrape
```

The container runs as a non-root user and stores runtime files in `./data`:

* `./data/cookie.json`
* `./data/download_comments/txt/...`
* `./data/download_comments/json/...`

Rebuild after dependency changes:

```bash
docker compose build
```

If automated login cannot complete, the CLI can import an authenticated
Instagram `Cookie` header from a logged-in browser request. The header must
include `sessionid`, `csrftoken`, `mid`, and `ds_user_id`; it is then stored in
`./data/cookie.json`.

## 📁 Output
* TXT: download_comments/txt/reel_comments_YYYYMMDD_HHMMSS.txt
Expand All @@ -67,6 +93,7 @@ Example JSON structure:
```
---
## 🔧 How it Works
* Login: uses `instagrapi` for current Instagram login and 2FA support, with a cookie-header fallback when automated login is blocked.
* Cookie Lifecycle: cookie.json stores iat and expiry; validated on startup & during requests.
* Error Resilience: retries transient errors and refreshes cookies on 401/redirect-to-login.
* Progress Accuracy: uses Instagram’s comment count to calculate percent & ETA.
Expand All @@ -75,7 +102,7 @@ Example JSON structure:
## 💡 Tips
* Start with 5-7 RPS to minimize throttling; increase gradually.
* Filenames use local time; switch to UTC by replacing datetime.now() with datetime.now(timezone.utc) in main.py (datetime.utcnow() is deprecated since Python 3.12).
* If login fails with `challenge_required`, open Instagram in the official app or browser, approve the login challenge, then retry.
---
## ⚠️ Disclaimer
Use responsibly. Comply with Instagram’s Terms of Service. Intended for personal or permitted use only.

11 changes: 11 additions & 0 deletions docker-compose.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Single service for the scraper CLI.
services:
instascrape:
# Build the image from the Dockerfile in this directory and tag it locally.
build:
context: .
image: instascrape:local
# Keep stdin open and allocate a TTY so interactive prompts can be answered.
stdin_open: true
tty: true
volumes:
# Bind-mount ./data on the host to /data in the container.
- ./data:/data
environment:
PYTHONUNBUFFERED: "1"
196 changes: 179 additions & 17 deletions login.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,20 +2,23 @@
# -*- coding: utf-8 -*-

import argparse
import getpass
import json
import time
import hmac
import hashlib
import uuid
import random
import sys
from typing import Optional, Dict, Any, Tuple
from typing import Callable, Optional, Dict, Any, Tuple

import requests

API_URL = "https://i.instagram.com/api/{version}/"
V = "v1"
USER_AGENT = "Instagram 123.0.0.0 Android (30/11; 420dpi; 1080x1920; Google; Pixel; sailfish; qcom; en_US)"
# Keep this close to a current official Android release. Instagram can block
# login attempts that look like very old app builds, especially for EU users.
USER_AGENT = "Instagram 430.0.0.19.80 Android (35/15; 420dpi; 1080x2400; Google/google; Pixel 8; shiba; shiba; en_US; 123456789)"
IG_SIG_KEY = "0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef"
SIG_KEY_VERSION = "4"
IG_CAPABILITIES = "3brTvw"
Expand Down Expand Up @@ -134,17 +137,168 @@ def cookie_json_valid(d: Optional[Dict[str, Any]]) -> bool:
required_ok = all(k in c and isinstance(c.get(k), str) and c.get(k) for k in ("sessionid", "csrftoken", "mid", "ds_user_id"))
return bool(required_ok and isinstance(overall, int) and overall > now)

def parse_cookie_string(cookie_header: str) -> Tuple[str, str, str, str]:
    """Parse a pasted Instagram ``Cookie`` header into the login cookie tuple.

    The header is split on ``;`` into ``name=value`` pairs; items without an
    ``=`` are ignored and surrounding whitespace is stripped from names and
    values. A leading ``Cookie:`` header name (any letter case) is accepted
    and dropped, because that is how the header is usually copied from a
    browser's network inspector.

    Args:
        cookie_header: Raw header text, e.g.
            ``"sessionid=...; csrftoken=...; mid=...; ds_user_id=..."``.

    Returns:
        ``(sessionid, csrftoken, mid, ds_user_id)``.

    Raises:
        LoginError: If any of the four required cookies is absent or empty
            (including when the header itself is empty).
    """
    required = ("sessionid", "csrftoken", "mid", "ds_user_id")

    raw = (cookie_header or "").strip()
    # Without this, "Cookie: sessionid=..." would yield the key
    # "Cookie: sessionid" and the header would be rejected as incomplete.
    header_name = "cookie:"
    if raw[:len(header_name)].lower() == header_name:
        raw = raw[len(header_name):]

    parts: Dict[str, str] = {}
    for item in raw.split(";"):
        # Split on the first "=" only: cookie values may themselves contain "=".
        key, sep, value = item.partition("=")
        if not sep:
            continue
        parts[key.strip()] = value.strip()

    missing = [k for k in required if not parts.get(k)]
    if missing:
        raise LoginError(f"Cookie string is missing required values: {', '.join(missing)}")

    return parts["sessionid"], parts["csrftoken"], parts["mid"], parts["ds_user_id"]

# ---------- Login Core ----------

class LoginError(Exception):
    """Error raised by the login helpers when a login cannot be completed."""

def login_instagram(username: str, password: str, timeout_prelogin: int = 10, timeout_login: int = 20) -> Tuple[str, str, str, str]:
def _cookie_tuple_from_mapping(cookies: Dict[str, Any]) -> Optional[Tuple[str, str, str, str]]:
values = {k: str(cookies.get(k) or "") for k in ("sessionid", "csrftoken", "mid", "ds_user_id")}
if all(values.values()):
return values["sessionid"], values["csrftoken"], values["mid"], values["ds_user_id"]
return None

def _cookie_tuple_from_jar(jar: requests.cookies.RequestsCookieJar) -> Optional[Tuple[str, str, str, str]]:
    """Read the four login cookies for ``instagram.com`` out of a cookie jar.

    Returns:
        ``(sessionid, csrftoken, mid, ds_user_id)`` when every lookup yields a
        truthy value, otherwise ``None``.
    """
    found = tuple(
        get_cookie_value(jar, name, domain="instagram.com")
        for name in ("sessionid", "csrftoken", "mid", "ds_user_id")
    )
    if not all(found):
        return None
    return found

def _extract_cookie_tuple(session: requests.Session) -> Tuple[str, str, str, str]:
    """Return the login cookie tuple held by ``session``.

    Raises:
        LoginError: If the session's cookie jar lacks any required cookie.
    """
    cookies = _cookie_tuple_from_jar(session.cookies)
    if not cookies:
        raise LoginError("Login succeeded but required cookies are missing.")
    return cookies

def _extract_instagrapi_cookies(client: Any) -> Tuple[str, str, str, str]:
    """Pull the login cookie tuple out of a logged-in instagrapi client.

    Two sources are tried in order:

    1. The ``cookies`` jar of the client's ``private`` and then ``public``
       attribute (presumably its HTTP sessions), when present.
    2. The ``"cookies"`` dict inside ``client.get_settings()``.

    Returns:
        ``(sessionid, csrftoken, mid, ds_user_id)``.

    Raises:
        LoginError: If neither source provides all four cookies.
    """
    for session_name in ("private", "public"):
        jar = getattr(getattr(client, session_name, None), "cookies", None)
        if jar is None:
            continue
        from_jar = _cookie_tuple_from_jar(jar)
        if from_jar:
            return from_jar

    # Best effort: a failing settings dump just means this source is skipped.
    try:
        settings = client.get_settings()
    except Exception:
        settings = {}

    stored = settings.get("cookies") if isinstance(settings, dict) else None
    if isinstance(stored, dict):
        from_settings = _cookie_tuple_from_mapping(stored)
        if from_settings:
            return from_settings

    raise LoginError("instagrapi login succeeded but required cookies are missing.")

def login_instagram_with_instagrapi(
    username: str,
    password: str,
    two_factor_code_provider: Optional[Callable[[Dict[str, Any]], str]] = None,
) -> Tuple[str, str, str, str]:
    """Log in through ``instagrapi`` and return the session cookies.

    Args:
        username: Instagram username.
        password: Instagram password.
        two_factor_code_provider: Callable invoked with an (empty) info dict
            when Instagram asks for a two-factor code; it must return the
            code as a string. Spaces inside the code are removed.

    Returns:
        ``(sessionid, csrftoken, mid, ds_user_id)``.

    Raises:
        LoginError: If ``instagrapi`` cannot be imported, if a 2FA code is
            required but no provider was given or the code is empty, if
            Instagram demands a challenge, or if either login attempt fails
            for any other reason.
    """
    # Imported lazily so the module stays usable without instagrapi installed;
    # any import-time failure is reported as a LoginError so callers can fall back.
    try:
        from instagrapi import Client
        from instagrapi.exceptions import ChallengeRequired, TwoFactorRequired
    except Exception as exc:
        raise LoginError(f"instagrapi is not available: {exc}") from exc

    client = Client()

    needs_two_factor = False
    try:
        client.login(username, password)
    except TwoFactorRequired:
        # Handle 2FA after this try statement: an exception raised inside an
        # except clause is NOT caught by the sibling clauses below, so doing
        # the second login here would leak raw instagrapi exceptions.
        needs_two_factor = True
    except ChallengeRequired as exc:
        raise LoginError(f"Challenge required by Instagram; solve in-app and retry. Details: {exc}") from exc
    except Exception as exc:
        raise LoginError(f"instagrapi login failed: {exc}") from exc

    if needs_two_factor:
        if two_factor_code_provider is None:
            raise LoginError("Two-factor authentication required on this account; provide a 2FA code to continue.")
        code = two_factor_code_provider({}).strip().replace(" ", "")
        if not code:
            raise LoginError("Two-factor authentication code was empty.")
        try:
            client.login(username, password, verification_code=code)
        except ChallengeRequired as exc:
            raise LoginError(f"Challenge required by Instagram; solve in-app and retry. Details: {exc}") from exc
        except Exception as exc:
            raise LoginError(f"instagrapi login failed: {exc}") from exc

    return _extract_instagrapi_cookies(client)

def _pick_2fa_method(two_factor_info: Dict[str, Any]) -> str:
if two_factor_info.get("totp_two_factor_on"):
return "3"
if two_factor_info.get("sms_two_factor_on"):
return "1"
return "3"

def _complete_two_factor_login(
    session: requests.Session,
    username: str,
    two_factor_info: Dict[str, Any],
    code_provider: Callable[[Dict[str, Any]], str],
    timeout_login: int,
) -> Tuple[str, str, str, str]:
    """Finish a login that stopped at the two-factor step.

    Posts the user's verification code to ``accounts/two_factor_login/`` on
    the given session and, on success, reads the login cookies back from it.

    Args:
        session: Session that performed the initial login request.
        username: Instagram username.
        two_factor_info: ``two_factor_info`` object from the login response;
            must contain ``two_factor_identifier``.
        code_provider: Callable invoked with ``two_factor_info`` that returns
            the verification code. Spaces inside the code are removed.
        timeout_login: Request timeout in seconds.

    Returns:
        ``(sessionid, csrftoken, mid, ds_user_id)``.

    Raises:
        LoginError: On a missing identifier, an empty code, a network error,
            a non-200 response, a response without a logged-in user, or
            missing cookies after a successful response.
    """
    identifier = two_factor_info.get("two_factor_identifier")
    if not identifier:
        raise LoginError("Two-factor authentication required but Instagram did not return a two_factor_identifier.")

    code = code_provider(two_factor_info).strip().replace(" ", "")
    if not code:
        raise LoginError("Two-factor authentication code was empty.")

    # NOTE(review): login_instagram calls generate_device_id() with no
    # argument; confirm the helper accepts a username seed here.
    device_id = generate_device_id(username)
    params = {
        "username": username,
        "verification_code": code,
        "two_factor_identifier": identifier,
        "trust_this_device": "1",
        "guid": generate_uuid(False),
        "device_id": device_id,
        "waterfall_id": generate_uuid(False),
        "verification_method": _pick_2fa_method(two_factor_info),
    }
    signed = sign_params(IG_SIG_KEY, SIG_KEY_VERSION, params)
    url = API_URL.format(version=V) + "accounts/two_factor_login/"

    try:
        r = session.post(
            url,
            data=signed,
            headers={"Content-type": "application/x-www-form-urlencoded; charset=UTF-8"},
            timeout=timeout_login,
        )
    except requests.RequestException as e:
        raise LoginError(f"Network error during two-factor login: {e}") from e

    # A non-JSON body is kept as text so it still shows up in the error message.
    try:
        j = r.json()
    except ValueError:
        j = {"status": "unknown", "text": r.text}

    if r.status_code != 200:
        raise LoginError(f"HTTP {r.status_code} during two-factor login: {j}")

    # "logged_in_user" may be absent, null, or not an object; treat each of
    # those as a failed login instead of crashing on `.get`.
    user = j.get("logged_in_user") if isinstance(j, dict) else None
    if not isinstance(user, dict) or not user.get("pk"):
        raise LoginError(f"Unable to complete two-factor login: {j}")

    return _extract_cookie_tuple(session)

def login_instagram(
username: str,
password: str,
timeout_prelogin: int = 10,
timeout_login: int = 20,
two_factor_code_provider: Optional[Callable[[Dict[str, Any]], str]] = None,
) -> Tuple[str, str, str, str]:
"""
Perform the mobile-app-like login and return tuple:
(sessionid, csrftoken, mid, ds_user_id)
Raises LoginError on failure with meaningful message.
"""
try:
return login_instagram_with_instagrapi(username, password, two_factor_code_provider)
except LoginError as instagrapi_error:
fallback_error = str(instagrapi_error)

s = requests.Session()
s.headers.update(default_headers(USER_AGENT, IG_CAPABILITIES, APPLICATION_ID))

Expand All @@ -159,7 +313,7 @@ def login_instagram(username: str, password: str, timeout_prelogin: int = 10, ti

csrftoken = get_cookie_value(s.cookies, "csrftoken")
if not csrftoken:
raise LoginError("Unable to get CSRF from prelogin.")
raise LoginError(f"{fallback_error}; legacy login fallback also failed: Unable to get CSRF from prelogin.")

device_id = generate_device_id()
login_params = {
Expand Down Expand Up @@ -194,36 +348,44 @@ def login_instagram(username: str, password: str, timeout_prelogin: int = 10, ti
raise LoginError(f"Unexpected login response: {j}")

if j.get("two_factor_required"):
raise LoginError("Two-factor authentication required on this account; interactive flow not implemented.")
if two_factor_code_provider is None:
raise LoginError("Two-factor authentication required on this account; provide a 2FA code to continue.")
return _complete_two_factor_login(
s,
username,
j.get("two_factor_info") or {},
two_factor_code_provider,
timeout_login,
)

if j.get("challenge_required"):
raise LoginError("Challenge required by Instagram; solve in-app and retry.")

if not j.get("logged_in_user", {}).get("pk"):
raise LoginError(f"Unable to login: {j}")

# Extract cookies
sessionid = get_cookie_value(s.cookies, "sessionid", domain="instagram.com")
csrftoken = get_cookie_value(s.cookies, "csrftoken", domain="instagram.com")
mid = get_cookie_value(s.cookies, "mid", domain="instagram.com")
dsuserid = get_cookie_value(s.cookies, "ds_user_id", domain="instagram.com")

if not all([sessionid, csrftoken, mid, dsuserid]):
raise LoginError("Login succeeded but required cookies are missing.")

return (sessionid, csrftoken, mid, dsuserid)
return _extract_cookie_tuple(s)

# ---------- CLI ----------

def main():
ap = argparse.ArgumentParser(description="Instagram app login and dump cookies (standalone)")
ap.add_argument("-u", "--username", required=True)
ap.add_argument("-p", "--password", required=True)
ap.add_argument("-p", "--password")
ap.add_argument("--cookie-header", help="Paste an authenticated Instagram Cookie header instead of logging in.")
ap.add_argument("--json", action="store_true")
args = ap.parse_args()

try:
sessionid, csrftoken, mid, dsuserid = login_instagram(args.username, args.password)
if args.cookie_header:
sessionid, csrftoken, mid, dsuserid = parse_cookie_string(args.cookie_header)
else:
password = args.password or getpass.getpass("Instagram password: ")
sessionid, csrftoken, mid, dsuserid = login_instagram(
args.username,
password,
two_factor_code_provider=lambda info: getpass.getpass("Two-factor authentication code: "),
)
if args.json:
print(json.dumps(
{"sessionid": sessionid, "csrftoken": csrftoken, "mid": mid, "ds_user_id": dsuserid},
Expand Down
Loading