Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
184 changes: 77 additions & 107 deletions scrapers/MyMemberSite/MyMemberSite.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,20 +20,11 @@
}
)

def __api_request(scrape_url: str) -> dict:

def _fetch_page(scrape_url: str) -> str:
"""
Makes a GET request to the given URL and returns the JSON response.
Fetches the page at scrape_url and returns the HTML text.
Exits if the request fails or if the response status is not 200.

Parameters
----------
scrape_url : str
The URL to fetch.

Returns
-------
dict
The JSON response from the API.
"""
log.debug(f"Fetching '{scrape_url}'")
try:
Expand All @@ -46,103 +37,76 @@ def __api_request(scrape_url: str) -> dict:
log.error(f"Fetching '{scrape_url}' resulted in error status: {response.status_code}")
sys.exit(-1)

data = response.json()
log.trace(f"Raw data from API: {data}")
return data


def __parse_url(api_url: str, scraped_url: str) -> str:
"""
Returns the studio name and the API URL corresponding to the given URL.

Exits if the domain is not known or if the path does not conform to the expected format
return response.text

Parameters
----------
api_url : str
The base URL of the API.
scraped_url : str
The URL to parse.

Returns
-------
str
The API URL corresponding to the given URL.
def _extract_nextjs_video_data(html: str) -> dict | None:
"""
path = urlparse(scraped_url).path
if not (
match := re.search(
r"\/(?P<type>videos|photosets)\/(?P<id>\d+)(?:-(?P<name>.+))?$", path
)
):
log.error(f"Unable to parse URL '{scraped_url}'")
sys.exit(-1)

_type, _id, slug = match.groups()

return f"{api_url}/{_type}/{_id}"


def __parse_api_url(scrape_url: str) -> str:
Parse the Next.js __next_f RSC payload embedded in server-rendered HTML to
extract scene/gallery data.
"""
Fetches the page and extracts the subdomain meta tag to determine the API URL.
Exits if the page cannot be fetched or if the meta tag is not found.

Parameters
----------
scrape_url : str
The URL of the page to fetch.

Returns
-------
str
The API URL extracted from the meta tag.
"""
response = requests.get(scrape_url)
if response.status_code != 200:
log.error(f"Fetching '{scrape_url}' resulted in error status: {response.status_code}")
sys.exit(-1)

html = BeautifulSoup(response.content, "html.parser")
subdomain_tag = html.find("meta", {"name": "subdomain"})
if not subdomain_tag or "content" not in subdomain_tag.attrs:
log.error(f"Meta tag with name 'subdomain' not found in '{scrape_url}'")
sys.exit(-1)

return f"https://{subdomain_tag['content']}.mymember.site/api"

pattern = re.compile(
r'self\.__next_f\.push\(\[1,\s*"((?:[^"\\]|\\.)*)"\]\)', re.DOTALL
)

def __fetch_studio(api_url: str, scrape_url: str) -> ScrapedStudio:
for m in pattern.finditer(html):
raw = m.group(1)
try:
decoded = json.loads('"' + raw + '"')
except Exception:
continue

if '"pageType":"video"' not in decoded and '"pageType":"photoset"' not in decoded:
continue

data_match = re.search(r'"data":(\{)', decoded)
if not data_match:
continue

start = data_match.start(1)
depth = 0
for i, ch in enumerate(decoded[start:], start):
if ch == "{":
depth += 1
elif ch == "}":
depth -= 1
if depth == 0:
try:
return json.loads(decoded[start : i + 1])
except Exception:
break

return None


def _fetch_studio(html: str, scrape_url: str) -> ScrapedStudio:
"""
Fetches the studio information from the API.
Exits if the API request fails or if the site_info is not found in the response.

Parameters
----------
api_url : str
The base URL of the API.
scrape_url : str
The original URL being scraped, used to construct the studio URL.

Returns
-------
ScrapedStudio
The scraped studio data.
Extract studio name and URL from __next_f payload.
"""
response = __api_request(f"{api_url}/auth/init")
if "site_info" not in response:
log.error(f"site_info not found in '{api_url}/api/auth/init' response")
sys.exit(-1)

parsed_url = urlparse(scrape_url)
studio_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
if parsed_url.netloc == "mymember.site":
studio_url += f"/{parsed_url.path.split('/', 3)[1]}"

pattern = re.compile(
r'self\.__next_f\.push\(\[1,\s*"((?:[^"\\]|\\.)*)"\]\)', re.DOTALL
)
for m in pattern.finditer(html):
raw = m.group(1)
try:
decoded = json.loads('"' + raw + '"')
except Exception:
continue
if '"site_long_name"' not in decoded:
continue
name_m = re.search(r'"site_long_name":"([^"]+)"', decoded)
url_m = re.search(r'"site_url":"(https?://[^"]+)"', decoded)
if name_m:
parsed = urlparse(scrape_url)
return ScrapedStudio(
name=name_m.group(1),
url=url_m.group(1) if url_m else f"{parsed.scheme}://{parsed.netloc}",
)

parsed = urlparse(scrape_url)
return ScrapedStudio(
name=response["site_info"].get("site_long_name", "MyMemberSite"),
url=studio_url,
image=response["site_info"].get("site_logo", ""),
name=parsed.netloc,
url=f"{parsed.scheme}://{parsed.netloc}",
)


Expand All @@ -160,9 +124,12 @@ def gallery_from_url(gallery_url: str) -> ScrapedGallery:
ScrapedGallery
The scraped gallery data.
"""
api_url: str = __parse_api_url(gallery_url)
studio: ScrapedStudio = __fetch_studio(api_url, gallery_url)
raw_gallery = __api_request(__parse_url(api_url, gallery_url))
html = _fetch_page(gallery_url)
studio: ScrapedStudio = _fetch_studio(html, gallery_url)
raw_gallery = _extract_nextjs_video_data(html)
if not raw_gallery:
log.error(f"Could not extract gallery data from Next.js payload at '{gallery_url}'")
sys.exit(-1)

scraped: ScrapedGallery = {}

Expand Down Expand Up @@ -203,9 +170,12 @@ def scene_from_url(scene_url: str) -> ScrapedScene:
ScrapedScene
The scraped scene data.
"""
api_url: str = __parse_api_url(scene_url)
studio: ScrapedStudio = __fetch_studio(api_url, scene_url)
raw_scene = __api_request(__parse_url(api_url, scene_url))
html = _fetch_page(scene_url)
studio: ScrapedStudio = _fetch_studio(html, scene_url)
raw_scene = _extract_nextjs_video_data(html)
if not raw_scene:
log.error(f"Could not extract video data from Next.js payload at '{scene_url}'")
sys.exit(-1)

scraped: ScrapedScene = {}

Expand Down
3 changes: 3 additions & 0 deletions scrapers/MyMemberSite/MyMemberSite.yml
Original file line number Diff line number Diff line change
Expand Up @@ -193,6 +193,7 @@ galleryByURL:
- loonerlove.com/
- lorenzoviota.com/
- lovehershoes.club/
- lunaevans.com/
- lukespov.vip/
- luxurioussins.com/
- madameroyalehamburg.com/
Expand Down Expand Up @@ -222,6 +223,7 @@ galleryByURL:
- monajade.com/
- moonroxfans.com/
- moonssecretgarden.com/
- mranalizeher.com/
- mrhappyendings.com/
- mrhappyendingspov.com/
- mrharrylong.com/
Expand All @@ -231,6 +233,7 @@ galleryByURL:
- mrrainsexfights.com/
- mshift.media/
- msskinlive.com/
- murkovskihub.com/
- musebyrayhapa.com/
- mymember.site/
- nastila.net/
Expand Down