From 26ffbd34a85ada5774d261edb2bf7821d8e8aba1 Mon Sep 17 00:00:00 2001 From: nrg101 Date: Thu, 30 Apr 2026 12:11:34 +0100 Subject: [PATCH 01/10] fix scene list scraping Co-authored-by: Copilot --- scrapers/DreamTranny.yml | 47 +++++++++++++++++++++++++--------------- 1 file changed, 29 insertions(+), 18 deletions(-) diff --git a/scrapers/DreamTranny.yml b/scrapers/DreamTranny.yml index 7efbe952d..8a4f3a2c6 100644 --- a/scrapers/DreamTranny.yml +++ b/scrapers/DreamTranny.yml @@ -24,20 +24,6 @@ performerByURL: - dreamtranny.com/models/ scraper: performerScraper xPathScrapers: - sceneListScraper: - scene: - Title: //div[contains(@class, "video-item")]//div[@class="item-content"]//a - Image: //div[contains(@class, "video-item")]//a[@class="item-thumb"]/img/@src - Date: - selector: //div[contains(@class, "video-item")]//div[@class="item-content"]/div[2]/p - postProcess: - - parseDate: Jan 2, 2006 - URL: - selector: //div[contains(@class, "video-item")]//div[@class="item-content"]//a/@href - postProcess: - - replace: - - regex: ^(/.*) - with: https://dreamtranny.com$1 sceneScraper: scene: Title: &titleSel //div[@class="section-title"]/h4/text() @@ -76,6 +62,34 @@ xPathScrapers: - replace: - regex: .*/api/update/(\d+)/view_count.* with: "$1" + sceneListScraper: + common: + $videoItemContent: //div[contains(@class, "video-item")]//div[@class="item-content"] + $performerLink: //div[contains(@class, "video-item")]//div[@class="item-content"]//a[contains(@href, "/models/")] + $videoLink: //div[contains(@class, "video-item")]//div[@class="item-content"]//a[contains(@href, "/update/")] + scene: + Title: $videoLink/text() + Image: //div[contains(@class, "video-item")]//a[@class="item-thumb"]/img/@src + Date: + selector: $videoItemContent/div[2]/p + postProcess: + - parseDate: Jan 2, 2006 + URL: + selector: $videoLink/@href + postProcess: &urlPostProcess + - replace: + - regex: ^(/.*) + with: https://dreamtranny.com$1 + Studio: *studioAttr + # The following are not available on the listing page, so we will grab them from the scene page using the scene URL + Details: + selector: $videoLink/@href + postProcess: + - replace: + - regex: ^(/.*) + with: https://dreamtranny.com$1 + - subScraper: + selector: *detailsSel galleryScraper: gallery: Title: *titleSel @@ -92,10 +106,7 @@ xPathScrapers: fixed: transgender_female Image: selector: //div[@class="model-img"]/a/img[@class="img"]/@src - postProcess: - - replace: - - regex: ^(/.*) - with: https://dreamtranny.com/$1 + postProcess: *urlPostProcess Country: //div[@class="model-content"]/p/span[text()="NATIONALITY"]/following-sibling::span[1] Birthdate: selector: //div[@class="model-content"]/p/span[text()="DATE OF BIRTH"]/following-sibling::span[1] From 13eb2e52b39f363cb65c69209a913f35d5be9525 Mon Sep 17 00:00:00 2001 From: nrg101 Date: Thu, 30 Apr 2026 13:27:20 +0100 Subject: [PATCH 02/10] start refactoring to python Co-authored-by: Copilot --- scrapers/DreamTranny/DreamTranny.py | 155 +++++++++++++++++++++ scrapers/{ => DreamTranny}/DreamTranny.yml | 18 ++- 2 files changed, 167 insertions(+), 6 deletions(-) create mode 100644 scrapers/DreamTranny/DreamTranny.py rename scrapers/{ => DreamTranny}/DreamTranny.yml (92%) diff --git a/scrapers/DreamTranny/DreamTranny.py b/scrapers/DreamTranny/DreamTranny.py new file mode 100644 index 000000000..7d896a2f2 --- /dev/null +++ b/scrapers/DreamTranny/DreamTranny.py @@ -0,0 +1,155 @@ +from concurrent.futures import ThreadPoolExecutor, as_completed +from datetime import datetime +import json +import os +import re +import sys +from typing import Any + +from py_common.deps import ensure_requirements +ensure_requirements("bs4:beautifulsoup4", "requests") +import py_common.log as log +from py_common.types import ScrapedGallery, ScrapedGroup, ScrapedPerformer, ScrapedScene, ScrapedStudio, ScrapedTag +from py_common.util import dig, guess_nationality, scraper_args + +import requests +from bs4 import BeautifulSoup as bs + +STUDIO = ScrapedStudio(name="Dream Tranny", url="https://www.dreamtranny.com") + +def relative_url_to_absolute(base_url: str, relative_url: str) -> str: + log.debug(f"Converting relative URL to absolute: base_url={base_url}, relative_url={relative_url}") + if relative_url.startswith("http"): + return relative_url + return requests.compat.urljoin(base_url, relative_url) + +def scene_from_url(url: str) -> ScrapedScene: + sess = requests.Session() + res = sess.get(url) + res.raise_for_status() + soup = bs(res.text, "html.parser") + + scene = ScrapedScene(studio=STUDIO) + + # title from xpath //div[@class="section-title"]/h4/text() + if title_elem := soup.select_one("div.section-title > h4"): + scene["title"] = title_elem.get_text(strip=True) + else: + log.warning("Title not found") + + # details from xpath //p[@class="read-more"]/text() + details = None + if details_elem := soup.select_one("p.read-more"): + details = details_elem.get_text(strip=True) + scene["details"] = details + else: + log.warning("Details not found") + + # date from xpath //small[@class="updated-at"]/text() with format "Jan 2, 2006" + if date_elem := soup.select_one("small.updated-at"): + date_str = date_elem.get_text(strip=True) + try: + # parse string to a datetime object + date_obj = datetime.strptime(date_str, "%b %d, %Y") + # format date as "YYYY-MM-DD" + scene["date"] = date_obj.strftime("%Y-%m-%d") + except Exception as ex: + log.warning(f"Error parsing date: {ex}") + else: + log.warning("Date not found") + + # performers from xpath //a[contains(@class, "model-name")] with name from text() and url from @href + if performer_links := soup.select("a.model-name"): + performers: list[ScrapedPerformer] = [] + for el in performer_links: + p_name = el.get_text(strip=True) + p_url = el.get("href") + if p_name and p_url: + log.debug(f"Found performer: name={p_name}, url={p_url}") + performers.append(ScrapedPerformer(name=p_name, url=p_url)) + # male performers are often mentions in the description + # so we can try to extract them from there using a regex pattern for likely full names + if details and (description_performers := re.findall("([A-Z][a-z]+ [A-Z][a-z]+)", details, re.DOTALL)): + for dp in description_performers: + if not any(dp == p["name"] for p in performers): + log.debug(f"Adding performer from description: {dp}") + performers.append(ScrapedPerformer(name=dp)) + scene["performers"] = performers + else: + log.warning("Performers not found") + + + # cover image from xpath: + # //video[contains(@class,"video-js")]/@poster + # or + # //div[contains(@class,"model-player")]//img/@src + # or + # //video[contains(@class,"vjs")]/@poster + cover_url = None + if cover_url_elem := soup.select_one("video.video-js"): + cover_url = cover_url_elem.get("poster") + scene["image"] = relative_url_to_absolute(url, cover_url) + elif cover_url_elem := soup.select_one("div.model-player img"): + cover_url = cover_url_elem.get("src") + scene["image"] = relative_url_to_absolute(url, cover_url) + elif cover_url_elem := soup.select_one("video.vjs"): + cover_url = cover_url_elem.get("poster") + scene["image"] = relative_url_to_absolute(url, cover_url) + else: + log.warning("Cover image not found") + + # tags from xpath //div[@class="model-categories"]/a/text() + if tag_elements := soup.select("div.model-categories > a"): + tags = [ScrapedTag(name=el.get_text(strip=True)) for el in tag_elements] + scene["tags"] = tags + else: + log.warning("Tags not found") + + # url from xpath //script[contains(.,"API_VIEW_URLS")]/text() + # with regex pattern .*/api(/update/\d+)/view_count.* + if script_elements := soup.select("script"): + for el in script_elements: + if el.string and "API_VIEW_URLS" in el.string: + match = re.search(r'.*/api(/update/(\d+)/)view_count.*', el.string) + if match: + absolute_url = relative_url_to_absolute(url, match.group(1)) + log.debug(f"Extracted scene URL: {absolute_url}") + scene["url"] = absolute_url + scene["urls"] = [absolute_url] + scene["code"] = match.group(2) + break + if "url" not in scene: + log.warning("Scene URL not found in scripts") + else: + log.warning("Script elements not found for URL extraction") + + return scene + +if __name__ == "__main__": + op, args = scraper_args() + + log.debug(f"args: {args}") + match op, args: + # case "gallery-by-url", {"url": url} if url: + # result = gallery_from_url(url) + # case "gallery-by-fragment", args: + # result = gallery_from_fragment(args) + # case "group-by-url", {"url": url} if url: + # result = group_from_url(url) + case "scene-by-url", {"url": url} if url: + result = scene_from_url(url) + # case "scene-by-name", {"name": name} if name: + # result = scene_search(name) + # case "scene-by-fragment" | "scene-by-query-fragment", args: + # result = scene_from_fragment(args) + # case "performer-by-url", {"url": url}: + # result = performer_from_url(url) + # case "performer-by-fragment", args: + # result = performer_from_fragment(args) + # case "performer-by-name", {"name": name} if name: + # result = performer_search(name) + case _: + log.error(f"Operation: {op}, arguments: {json.dumps(args)}") + sys.exit(1) + + print(json.dumps(result)) \ No newline at end of file diff --git a/scrapers/DreamTranny.yml b/scrapers/DreamTranny/DreamTranny.yml similarity index 92% rename from scrapers/DreamTranny.yml rename to scrapers/DreamTranny/DreamTranny.yml index 8a4f3a2c6..d37ccc390 100644 --- a/scrapers/DreamTranny.yml +++ b/scrapers/DreamTranny/DreamTranny.yml @@ -1,4 +1,4 @@ -# yaml-language-server: $schema=../validator/scraper.schema.json +# yaml-language-server: $schema=../../validator/scraper.schema.json name: DreamTranny sceneByName: action: scrapeXPath @@ -9,10 +9,13 @@ sceneByQueryFragment: queryURL: "{url}" scraper: sceneScraper sceneByURL: - - action: scrapeXPath + - action: script url: - dreamtranny.com - scraper: sceneScraper + script: + - python + - DreamTranny.py + - scene-by-url galleryByURL: - action: scrapeXPath url: @@ -50,12 +53,15 @@ xPathScrapers: fixed: Dream Tranny Tags: &tagsAttr Name: //div[@class="model-categories"]/a/text() - URL: &urlAttr + URLs: &urlAttr selector: &urlSel //script[contains(.,"API_VIEW_URLS")]/text() - postProcess: + postProcess: &sceneURLPostProcess - replace: - regex: .*/api(/update/\d+)/view_count.* with: "https://dreamtranny.com$1/" + # URLs: + # selector: *urlSel + # postProcess: *sceneURLPostProcess Code: selector: *urlSel postProcess: @@ -119,4 +125,4 @@ driver: # site uses age verification for regions including UK and US # using CDP (set to true) with a VPN configured to another region, e.g. NL, will bypass this useCDP: false -# Last Updated August 1, 2025 +# Last Updated April 30, 2026 From cbea92646a44688149b3c9a40892e58eafafdd1f Mon Sep 17 00:00:00 2001 From: nrg101 Date: Thu, 30 Apr 2026 14:11:20 +0100 Subject: [PATCH 03/10] implement scene-by-name and scene-by-query-fragment Co-authored-by: Copilot --- scrapers/DreamTranny/DreamTranny.py | 131 +++++++++++++++++++-------- scrapers/DreamTranny/DreamTranny.yml | 16 ++-- 2 files changed, 102 insertions(+), 45 deletions(-) diff --git a/scrapers/DreamTranny/DreamTranny.py b/scrapers/DreamTranny/DreamTranny.py index 7d896a2f2..eae2bb97e 100644 --- a/scrapers/DreamTranny/DreamTranny.py +++ b/scrapers/DreamTranny/DreamTranny.py @@ -13,15 +13,37 @@ from py_common.util import dig, guess_nationality, scraper_args import requests -from bs4 import BeautifulSoup as bs +from bs4 import BeautifulSoup as bs, Tag STUDIO = ScrapedStudio(name="Dream Tranny", url="https://www.dreamtranny.com") -def relative_url_to_absolute(base_url: str, relative_url: str) -> str: - log.debug(f"Converting relative URL to absolute: base_url={base_url}, relative_url={relative_url}") - if relative_url.startswith("http"): - return relative_url - return requests.compat.urljoin(base_url, relative_url) +def url_to_absolute(base_url: str, url: str) -> str: + log.debug(f"Converting URL to absolute: base_url={base_url}, relative_url={url}") + if url.startswith("http"): + log.debug(f"URL is already absolute: {url}") + return url + absolute_url = requests.compat.urljoin(base_url, url) + log.debug(f"Converted URL to absolute: {absolute_url}") + return absolute_url + +def scrape_scene_date(date_elem: Tag): + date_str = date_elem.get_text(strip=True) + try: + date_obj = datetime.strptime(date_str, "%b %d, %Y") + return date_obj.strftime("%Y-%m-%d") + except Exception as ex: + log.warning(f"Error parsing date from search results: {ex}") + return None + +def scrape_performers(performer_links): + performers: list[ScrapedPerformer] = [] + for el in performer_links: + p_name = el.get_text(strip=True) + p_url = el.get("href") + if p_name and p_url: + log.debug(f"Found performer: name={p_name}, url={p_url}") + performers.append(ScrapedPerformer(name=p_name, url=p_url)) + return performers def scene_from_url(url: str) -> ScrapedScene: sess = requests.Session() @@ -47,38 +69,16 @@ def scene_from_url(url: str) -> ScrapedScene: # date from xpath //small[@class="updated-at"]/text() with format "Jan 2, 2006" if date_elem := soup.select_one("small.updated-at"): - date_str = date_elem.get_text(strip=True) - try: - # parse string to a datetime object - date_obj = datetime.strptime(date_str, "%b %d, %Y") - # format date as "YYYY-MM-DD" - scene["date"] = date_obj.strftime("%Y-%m-%d") - except Exception as ex: - log.warning(f"Error parsing date: {ex}") + scene["date"] = scrape_scene_date(date_elem) else: log.warning("Date not found") # performers from xpath //a[contains(@class, "model-name")] with name from text() and url from @href if performer_links := soup.select("a.model-name"): - performers: list[ScrapedPerformer] = [] - for el in performer_links: - p_name = el.get_text(strip=True) - p_url = el.get("href") - if p_name and p_url: - log.debug(f"Found performer: name={p_name}, url={p_url}") - performers.append(ScrapedPerformer(name=p_name, url=p_url)) - # male performers are often mentions in the description - # so we can try to extract them from there using a regex pattern for likely full names - if details and (description_performers := re.findall("([A-Z][a-z]+ [A-Z][a-z]+)", details, re.DOTALL)): - for dp in description_performers: - if not any(dp == p["name"] for p in performers): - log.debug(f"Adding performer from description: {dp}") - performers.append(ScrapedPerformer(name=dp)) - scene["performers"] = performers + scene["performers"] = scrape_performers(performer_links) else: log.warning("Performers not found") - # cover image from xpath: # //video[contains(@class,"video-js")]/@poster # or @@ -88,13 +88,13 @@ def scene_from_url(url: str) -> ScrapedScene: cover_url = None if cover_url_elem := soup.select_one("video.video-js"): cover_url = cover_url_elem.get("poster") - scene["image"] = relative_url_to_absolute(url, cover_url) + scene["image"] = url_to_absolute(url, cover_url) elif cover_url_elem := soup.select_one("div.model-player img"): cover_url = cover_url_elem.get("src") - scene["image"] = relative_url_to_absolute(url, cover_url) + scene["image"] = url_to_absolute(url, cover_url) elif cover_url_elem := soup.select_one("video.vjs"): cover_url = cover_url_elem.get("poster") - scene["image"] = relative_url_to_absolute(url, cover_url) + scene["image"] = url_to_absolute(url, cover_url) else: log.warning("Cover image not found") @@ -112,8 +112,7 @@ def scene_from_url(url: str) -> ScrapedScene: if el.string and "API_VIEW_URLS" in el.string: match = re.search(r'.*/api(/update/(\d+)/)view_count.*', el.string) if match: - absolute_url = relative_url_to_absolute(url, match.group(1)) - log.debug(f"Extracted scene URL: {absolute_url}") + absolute_url = url_to_absolute(url, match.group(1)) scene["url"] = absolute_url scene["urls"] = [absolute_url] scene["code"] = match.group(2) @@ -125,6 +124,60 @@ def scene_from_url(url: str) -> ScrapedScene: return scene +def scene_search(name: str) -> list[ScrapedScene]: + sess = requests.Session() + search_url = f"https://www.dreamtranny.com/updates/?q={requests.utils.quote(name)}" + res = sess.get(search_url) + res.raise_for_status() + soup = bs(res.text, "html.parser") + + scenes: list[ScrapedScene] = [] + # scene items from xpath //div[contains(@class, "video-item")] + if scene_results := soup.select("div.video-item"): + for el in scene_results: + scene = ScrapedScene(studio=STUDIO) + # scene link from relative xpath //div[@class="item-content"]//a[contains(@href, "/update/")]/text() + if scene_link := el.select_one('div.item-content a[href*="/update/"]'): + if scene_url := scene_link.get("href"): + log.debug(f"Found scene URL in search results: {scene_url}") + scene["url"] = url_to_absolute(search_url, scene_url) + if scene_title := scene_link.get_text(strip=True): + scene["title"] = scene_title + # cover image from relative xpath //a[@class="item-thumb"]/img/@src + if cover_img := el.select_one('a.item-thumb img'): + if cover_url := cover_img.get("src"): + scene["image"] = url_to_absolute(search_url, cover_url) + # date from relative xpath //div[@class="item-content"]/div[2]/p + if date_elem := el.select_one("div.item-content > div:nth-child(2) > p"): + scene["date"] = scrape_scene_date(date_elem) + # performers from relative xpath //div[@class="item-content"]//a[contains(@href, "/models/")] with name from text() and url from @href + if performer_links := el.select('div.item-content a[href*="/models/"]'): + scene["performers"] = scrape_performers(performer_links) + scenes.append(scene) + else: + log.warning("Scene links not found in search results") + + return scenes + +def scene_from_fragment(args: dict[str, Any]) -> list[ScrapedScene]: + # if url is provided, extract scene ID and call scene_from_url + if url := args.get("url"): + log.debug(f"Extracting scene from URL fragment: {url}") + return scene_from_url(url) + + # if name is provided, call scene_search + if name := args.get("name"): + log.debug(f"Searching for scene by name fragment: {name}") + if search_results := scene_search(name): + log.debug(f"Found {len(search_results)} search results for name: {name}") + return search_results[0] + else: + log.warning(f"No search results found for name: {name}") + return None + + log.error(f"No valid fragment provided in arguments: {args}") + return None + if __name__ == "__main__": op, args = scraper_args() @@ -138,10 +191,10 @@ def scene_from_url(url: str) -> ScrapedScene: # result = group_from_url(url) case "scene-by-url", {"url": url} if url: result = scene_from_url(url) - # case "scene-by-name", {"name": name} if name: - # result = scene_search(name) - # case "scene-by-fragment" | "scene-by-query-fragment", args: - # result = scene_from_fragment(args) + case "scene-by-name", {"name": name} if name: + result = scene_search(name) + case "scene-by-fragment" | "scene-by-query-fragment", args: + result = scene_from_fragment(args) # case "performer-by-url", {"url": url}: # result = performer_from_url(url) # case "performer-by-fragment", args: diff --git a/scrapers/DreamTranny/DreamTranny.yml b/scrapers/DreamTranny/DreamTranny.yml index d37ccc390..e76ebf5a0 100644 --- a/scrapers/DreamTranny/DreamTranny.yml +++ b/scrapers/DreamTranny/DreamTranny.yml @@ -1,13 +1,17 @@ # yaml-language-server: $schema=../../validator/scraper.schema.json name: DreamTranny sceneByName: - action: scrapeXPath - queryURL: https://dreamtranny.com/updates/?q={} - scraper: sceneListScraper + action: script + script: + - python + - DreamTranny.py + - scene-by-name sceneByQueryFragment: - action: scrapeXPath - queryURL: "{url}" - scraper: sceneScraper + action: script + script: + - python + - DreamTranny.py + - scene-by-query-fragment sceneByURL: - action: script url: From bc41b7b9211c72d94fda25c164812b5f868f9784 Mon Sep 17 00:00:00 2001 From: nrg101 Date: Thu, 30 Apr 2026 14:20:25 +0100 Subject: [PATCH 04/10] implement gallery-by-url and gallery-by-fragment Co-authored-by: Copilot --- scrapers/DreamTranny/DreamTranny.py | 56 +++++++++++++++++++++++++--- scrapers/DreamTranny/DreamTranny.yml | 20 ++++++++-- 2 files changed, 67 insertions(+), 9 deletions(-) diff --git a/scrapers/DreamTranny/DreamTranny.py b/scrapers/DreamTranny/DreamTranny.py index eae2bb97e..1bd5f3e6f 100644 --- a/scrapers/DreamTranny/DreamTranny.py +++ b/scrapers/DreamTranny/DreamTranny.py @@ -160,7 +160,7 @@ def scene_search(name: str) -> list[ScrapedScene]: return scenes def scene_from_fragment(args: dict[str, Any]) -> list[ScrapedScene]: - # if url is provided, extract scene ID and call scene_from_url + # if url is provided, call scene_from_url if url := args.get("url"): log.debug(f"Extracting scene from URL fragment: {url}") return scene_from_url(url) @@ -178,15 +178,61 @@ def scene_from_fragment(args: dict[str, Any]) -> list[ScrapedScene]: log.error(f"No valid fragment provided in arguments: {args}") return None +def gallery_from_url(url: str) -> ScrapedGallery: + # reuse scene_from_url to get gallery info since the page structure is the same + if scene := scene_from_url(url): + gallery = ScrapedGallery( + title=scene.get("title", ""), + details=scene.get("details"), + date=scene.get("date"), + tags=scene.get("tags", []), + performers=scene.get("performers", []), + studio=scene.get("studio"), + url=scene.get("url", ""), + code=scene.get("code", ""), + ) + return gallery + return None + +def gallery_from_fragment(args: dict[str, Any]) -> ScrapedGallery: + # if url is provided, call gallery_from_url + if url := args.get("url"): + log.debug(f"Extracting gallery from URL fragment: {url}") + return gallery_from_url(url) + + # if name is provided, call scene_search and convert first result to gallery + if name := args.get("name"): + log.debug(f"Searching for gallery by name fragment: {name}") + if search_results := scene_search(name): + log.debug(f"Found {len(search_results)} search results for name: {name}") + first_result = search_results[0] + gallery = ScrapedGallery( + title=first_result.get("title", ""), + details=first_result.get("details"), + date=first_result.get("date"), + tags=first_result.get("tags", []), + performers=first_result.get("performers", []), + studio=first_result.get("studio"), + url=first_result.get("url", ""), + code=first_result.get("code", ""), + ) + return gallery + else: + log.warning(f"No search results found for name: {name}") + return None + + log.error(f"No valid fragment provided for gallery extraction in arguments: {args}") + return None + if __name__ == "__main__": op, args = scraper_args() log.debug(f"args: {args}") match op, args: - # case "gallery-by-url", {"url": url} if url: - # result = gallery_from_url(url) - # case "gallery-by-fragment", args: - # result = gallery_from_fragment(args) + case "gallery-by-url", {"url": url} if url: + result = gallery_from_url(url) + case "gallery-by-fragment", args: + result = gallery_from_fragment(args) # case "group-by-url", {"url": url} if url: # result = group_from_url(url) case "scene-by-url", {"url": url} if url: diff --git a/scrapers/DreamTranny/DreamTranny.yml b/scrapers/DreamTranny/DreamTranny.yml index e76ebf5a0..f7f2a5f6a 100644 --- a/scrapers/DreamTranny/DreamTranny.yml +++ b/scrapers/DreamTranny/DreamTranny.yml @@ -20,16 +20,28 @@ sceneByURL: - python - DreamTranny.py - scene-by-url +galleryByFragment: + action: script + script: + - python + - DreamTranny.py + - gallery-by-fragment galleryByURL: - - action: scrapeXPath + - action: script url: - dreamtranny.com - scraper: galleryScraper + script: + - python + - DreamTranny.py + - gallery-by-url performerByURL: - - action: scrapeXPath + - action: script url: - dreamtranny.com/models/ - scraper: performerScraper + script: + - python + - DreamTranny.py + - performer-by-url xPathScrapers: sceneScraper: scene: From 3a979b9f1dae8314ce8e50f41ddf636d5e351ce5 Mon Sep 17 00:00:00 2001 From: nrg101 Date: Thu, 30 Apr 2026 14:38:24 +0100 Subject: [PATCH 05/10] implement performer-by-url Co-authored-by: Copilot --- scrapers/DreamTranny/DreamTranny.py | 49 +++++++++++++++++++++++++++-- 1 file changed, 47 insertions(+), 2 deletions(-) diff --git a/scrapers/DreamTranny/DreamTranny.py b/scrapers/DreamTranny/DreamTranny.py index 1bd5f3e6f..5ae2924a6 100644 --- a/scrapers/DreamTranny/DreamTranny.py +++ b/scrapers/DreamTranny/DreamTranny.py @@ -224,6 +224,51 @@ def gallery_from_fragment(args: dict[str, Any]) -> ScrapedGallery: log.error(f"No valid fragment provided for gallery extraction in arguments: {args}") return None +def performer_from_url(url: str) -> ScrapedPerformer: + performer = ScrapedPerformer(gender="TRANSGENDER_FEMALE") + sess = requests.Session() + res = sess.get(url) + res.raise_for_status() + soup = bs(res.text, "html.parser") + + # name from xpath //h1[@class="model-title"]/text() + if name_elem := soup.select_one("h1.model-title"): + performer["name"] = name_elem.get_text(strip=True) + else: + log.warning("Performer name not found") + + # image from xpath //div[@class="model-img"]/a/img[@class="img"]/@src + if img_elem := soup.select_one("div.model-img a img.img"): + if img_url := img_elem.get("src"): + performer["image"] = url_to_absolute(url, img_url) + else: + log.warning("Performer image not found") + + if model_content := soup.select_one('div.model-content'): + for p_tag in model_content.find_all('p'): + spans = p_tag.find_all('span') + for i, span in enumerate(spans): + # country from relative xpath /p/span[text()="NATIONALITY"]/following-sibling::span[1] + if span.get_text(strip=True) == "NATIONALITY" and i + 1 < len(spans): + country = spans[i + 1].get_text(strip=True) + performer["country"] = guess_nationality(country) + # birthdate from relative xpath /p/span[text()="DATE OF BIRTH"]/following-sibling::span[1] + if span.get_text(strip=True) == "DATE OF BIRTH" and i + 1 < len(spans): + birthdate_elem = spans[i + 1] + birthdate = birthdate_elem.get_text(strip=True) + # remove ordinal suffixes from birthdate (e.g. "2nd" -> "2") + birthdate = re.sub(r"(\d{1,2})(st|nd|rd|th)", r"\1", birthdate) + # convert birthdate from format "January 2, 2006" to "2006-01-02" + try: + birthdate_obj = datetime.strptime(birthdate, "%B %d, %Y") + performer["birthdate"] = birthdate_obj.strftime("%Y-%m-%d") + except Exception as ex: + log.warning(f"Error parsing performer birthdate: {ex}") + else: + log.warning("Performer model-content not found") + + return performer + if __name__ == "__main__": op, args = scraper_args() @@ -241,8 +286,8 @@ def gallery_from_fragment(args: dict[str, Any]) -> ScrapedGallery: result = scene_search(name) case "scene-by-fragment" | "scene-by-query-fragment", args: result = scene_from_fragment(args) - # case "performer-by-url", {"url": url}: - # result = performer_from_url(url) + case "performer-by-url", {"url": url}: + result = performer_from_url(url) # case "performer-by-fragment", args: # result = performer_from_fragment(args) # case "performer-by-name", {"name": name} if name: From db18c5d77a97ac92c58b08214df487af5a46a672 Mon Sep 17 00:00:00 2001 From: nrg101 Date: Thu, 30 Apr 2026 16:03:41 +0100 Subject: [PATCH 06/10] implement performer-by-name and performer-by-fragment Co-authored-by: Copilot --- scrapers/DreamTranny/DreamTranny.py | 61 +++++++++++++++++++++++----- scrapers/DreamTranny/DreamTranny.yml | 12 ++++++ 2 files changed, 62 insertions(+), 11 deletions(-) diff --git a/scrapers/DreamTranny/DreamTranny.py b/scrapers/DreamTranny/DreamTranny.py index 5ae2924a6..24a19059f 100644 --- a/scrapers/DreamTranny/DreamTranny.py +++ b/scrapers/DreamTranny/DreamTranny.py @@ -1,7 +1,5 @@ -from concurrent.futures import ThreadPoolExecutor, as_completed from datetime import datetime import json -import os import re import sys from typing import Any @@ -9,8 +7,8 @@ from py_common.deps import ensure_requirements ensure_requirements("bs4:beautifulsoup4", "requests") import py_common.log as log -from py_common.types import ScrapedGallery, ScrapedGroup, ScrapedPerformer, ScrapedScene, ScrapedStudio, ScrapedTag -from py_common.util import dig, guess_nationality, scraper_args +from py_common.types import ScrapedGallery, ScrapedPerformer, ScrapedScene, ScrapedStudio, ScrapedTag +from py_common.util import guess_nationality, scraper_args import requests from bs4 import BeautifulSoup as bs, Tag @@ -18,7 +16,6 @@ STUDIO = ScrapedStudio(name="Dream Tranny", url="https://www.dreamtranny.com") def url_to_absolute(base_url: str, url: str) -> str: - log.debug(f"Converting URL to absolute: base_url={base_url}, relative_url={url}") if url.startswith("http"): log.debug(f"URL is already absolute: {url}") return url @@ -41,8 +38,9 @@ def scrape_performers(performer_links): p_name = el.get_text(strip=True) p_url = el.get("href") if p_name and p_url: - log.debug(f"Found performer: name={p_name}, url={p_url}") - performers.append(ScrapedPerformer(name=p_name, url=p_url)) + p_abs_url = url_to_absolute(STUDIO["url"], p_url) + log.debug(f"Found performer: name={p_name}, url={p_abs_url}") + performers.append(ScrapedPerformer(name=p_name, url=p_abs_url)) return performers def scene_from_url(url: str) -> ScrapedScene: @@ -231,6 +229,9 @@ def performer_from_url(url: str) -> ScrapedPerformer: res.raise_for_status() soup = bs(res.text, "html.parser") + # remove query parameters from URL and store as performer URL + performer["url"] = url.split("?")[0] + # name from xpath //h1[@class="model-title"]/text() if name_elem := soup.select_one("h1.model-title"): performer["name"] = name_elem.get_text(strip=True) @@ -249,6 +250,7 @@ def performer_from_url(url: str) -> ScrapedPerformer: spans = p_tag.find_all('span') for i, span in enumerate(spans): # country from relative xpath /p/span[text()="NATIONALITY"]/following-sibling::span[1] + # the text values on the site are inconsistent, sometimes it can be "Brazil", other times "Brazilian" if span.get_text(strip=True) == "NATIONALITY" and i + 1 < len(spans): country = spans[i + 1].get_text(strip=True) performer["country"] = guess_nationality(country) @@ -269,6 +271,43 @@ def performer_from_url(url: str) -> ScrapedPerformer: return performer +def performer_search(name: str) -> list[ScrapedPerformer]: + # the scene search results also include performer links, so we can reuse scene_search to find performers by name + if search_results := scene_search(name): + performers = [] + for scene in search_results: + if "performers" in scene: + performers.extend(scene["performers"]) + log.debug(f"Extracted {len(performers)} performers from search results for name: {name}") + # deduplicate performers by name and url + unique_performers = {} + for performer in performers: + key = (performer.get("name"), performer.get("url")) + if key not in unique_performers: + unique_performers[key] = performer + log.debug(f"Deduplicated performers to {len(unique_performers)} unique entries for name: {name}") + # only return performers that contain the search name (case-insensitive) + filtered_performers = [p for p in unique_performers.values() if name.lower() in p.get("name", "").lower()] + log.debug(f"Found {len(filtered_performers)} performers matching search name: {name}") + return filtered_performers + else: + log.warning(f"No search results found for performer name: {name}") + return [] + +def performer_from_fragment(args: dict[str, Any]) -> list[ScrapedPerformer]: + # if url is provided, call performer_from_url + if url := args.get("url"): + log.debug(f"Extracting performer from URL fragment: {url}") + return performer_from_url(url) + + # if name is provided, call performer_search and return results + if name := args.get("name"): + log.debug(f"Searching for performer by name fragment: {name}") + return performer_search(name) + + log.error(f"No valid fragment provided for performer extraction in arguments: {args}") + return [] + if __name__ == "__main__": op, args = scraper_args() @@ -288,10 +327,10 @@ def performer_from_url(url: str) -> ScrapedPerformer: result = scene_from_fragment(args) case "performer-by-url", {"url": url}: result = performer_from_url(url) - # case "performer-by-fragment", args: - # result = performer_from_fragment(args) - # case "performer-by-name", {"name": name} if name: - # result = performer_search(name) + case "performer-by-fragment", args: + result = performer_from_fragment(args) + case "performer-by-name", {"name": name} if name: + result = performer_search(name) case _: log.error(f"Operation: {op}, arguments: {json.dumps(args)}") sys.exit(1) diff --git a/scrapers/DreamTranny/DreamTranny.yml b/scrapers/DreamTranny/DreamTranny.yml index f7f2a5f6a..d4796ef26 100644 --- a/scrapers/DreamTranny/DreamTranny.yml +++ b/scrapers/DreamTranny/DreamTranny.yml @@ -34,6 +34,18 @@ galleryByURL: - python - DreamTranny.py - gallery-by-url +performerByName: + action: script + script: + - python + - DreamTranny.py + - performer-by-name +performerByFragment: + action: script + script: + - python + - DreamTranny.py + - performer-by-fragment performerByURL: - action: script url: From b5c73aa61a7be7432fd1cfbd5e136529fac045cc Mon Sep 17 00:00:00 2001 From: nrg101 Date: Thu, 30 Apr 2026 16:04:22 +0100 Subject: [PATCH 07/10] remove old xpath scrapers --- scrapers/DreamTranny/DreamTranny.yml | 99 ---------------------------- 1 file changed, 99 deletions(-) diff --git a/scrapers/DreamTranny/DreamTranny.yml b/scrapers/DreamTranny/DreamTranny.yml index d4796ef26..03f4db322 100644 --- a/scrapers/DreamTranny/DreamTranny.yml +++ b/scrapers/DreamTranny/DreamTranny.yml @@ -54,103 +54,4 @@ performerByURL: - python - DreamTranny.py - performer-by-url -xPathScrapers: - sceneScraper: - scene: - Title: &titleSel //div[@class="section-title"]/h4/text() - Details: &detailsSel //p[@class="read-more"]/text() - Date: &dateAttr - selector: //small[@class="updated-at"]/text() - postProcess: - - parseDate: Jan 2, 2006 - Performers: &performersAttr - Name: //a[contains(@class, "model-name")]/text() - Image: - selector: >- - //video[contains(@class,"video-js")]/@poster - | - //div[contains(@class,"model-player")]//img/@src - | - //video[contains(@class,"vjs")]/@poster - postProcess: - - replace: - - regex: ^(/.*) - with: https://dreamtranny.com$1 - Studio: &studioAttr - Name: - fixed: Dream Tranny - Tags: &tagsAttr - Name: //div[@class="model-categories"]/a/text() - URLs: &urlAttr - selector: &urlSel //script[contains(.,"API_VIEW_URLS")]/text() - postProcess: &sceneURLPostProcess - - replace: - - regex: .*/api(/update/\d+)/view_count.* - with: "https://dreamtranny.com$1/" - # URLs: - # selector: *urlSel - # postProcess: *sceneURLPostProcess - Code: - selector: *urlSel - postProcess: - - replace: - - regex: .*/api/update/(\d+)/view_count.* - with: "$1" - sceneListScraper: - common: - $videoItemContent: //div[contains(@class, "video-item")]//div[@class="item-content"] - $performerLink: //div[contains(@class, "video-item")]//div[@class="item-content"]//a[contains(@href, "/models/")] - $videoLink: //div[contains(@class, "video-item")]//div[@class="item-content"]//a[contains(@href, "/update/")] - scene: - Title: $videoLink/text() - Image: //div[contains(@class, "video-item")]//a[@class="item-thumb"]/img/@src - Date: - selector: $videoItemContent/div[2]/p - postProcess: - - parseDate: Jan 2, 2006 - URL: - selector: $videoLink/@href - postProcess: &urlPostProcess - - replace: - - regex: ^(/.*) - with: https://dreamtranny.com$1 - Studio: *studioAttr - # The following are not available on the listing page, so we will grab them from the scene page using the scene URL - Details: - selector: $videoLink/@href - postProcess: - - replace: - - regex: ^(/.*) - with: https://dreamtranny.com$1 - - subScraper: - selector: *detailsSel - galleryScraper: - gallery: - Title: *titleSel - Details: *detailsSel - Date: *dateAttr - Tags: *tagsAttr - Performers: *performersAttr - Studio: *studioAttr - URL: *urlAttr - performerScraper: - performer: - Name: //h1[@class="model-title"]/text() - Gender: - fixed: transgender_female - Image: - selector: //div[@class="model-img"]/a/img[@class="img"]/@src - postProcess: *urlPostProcess - Country: //div[@class="model-content"]/p/span[text()="NATIONALITY"]/following-sibling::span[1] - Birthdate: - selector: //div[@class="model-content"]/p/span[text()="DATE OF BIRTH"]/following-sibling::span[1] - postProcess: - - replace: - - regex: (\d)(st|[nr]d|th) - with: "$1" - - parseDate: January 2, 2006 -driver: - # site uses age verification for regions including UK and US - # using CDP (set to true) with a VPN configured to another region, e.g. NL, will bypass this - useCDP: false # Last Updated April 30, 2026 From 1a39058ccde00431d3d98dc93b6aaecce4b3949e Mon Sep 17 00:00:00 2001 From: nrg101 Date: Thu, 30 Apr 2026 16:08:00 +0100 Subject: [PATCH 08/10] add sceneByFragment to YAML --- scrapers/DreamTranny/DreamTranny.yml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/scrapers/DreamTranny/DreamTranny.yml b/scrapers/DreamTranny/DreamTranny.yml index 03f4db322..35c47a359 100644 --- a/scrapers/DreamTranny/DreamTranny.yml +++ b/scrapers/DreamTranny/DreamTranny.yml @@ -6,6 +6,12 @@ sceneByName: - python - DreamTranny.py - scene-by-name +sceneByFragment: + action: script + script: + - python + - DreamTranny.py + - scene-by-fragment sceneByQueryFragment: action: script script: From 1c084ff1260179640d2cd5d8531e89eea3604d09 Mon Sep 17 00:00:00 2001 From: nrg101 Date: Thu, 30 Apr 2026 16:27:24 +0100 Subject: [PATCH 09/10] use fragment title Co-authored-by: Copilot --- scrapers/DreamTranny/DreamTranny.py | 45 +++++++++++++++-------------- 1 file changed, 23 insertions(+), 22 deletions(-) diff --git a/scrapers/DreamTranny/DreamTranny.py b/scrapers/DreamTranny/DreamTranny.py index 24a19059f..52caac59e 100644 --- a/scrapers/DreamTranny/DreamTranny.py +++ b/scrapers/DreamTranny/DreamTranny.py @@ -157,20 +157,20 @@ def scene_search(name: str) -> list[ScrapedScene]: return scenes -def scene_from_fragment(args: dict[str, Any]) -> list[ScrapedScene]: +def scene_from_fragment(args: dict[str, Any]) -> ScrapedScene: # if url is provided, call scene_from_url if url := args.get("url"): log.debug(f"Extracting scene from URL fragment: {url}") return scene_from_url(url) - # if name is provided, call scene_search - if name := args.get("name"): - log.debug(f"Searching for scene by name fragment: {name}") - if search_results := scene_search(name): - log.debug(f"Found {len(search_results)} search results for name: {name}") - return search_results[0] + # if title is provided, call scene_search + if title := args.get("title"): + log.debug(f"Searching for scene by title fragment: {title}") + if search_results := scene_search(title): + log.debug(f"Found {len(search_results)} search results for title: {title}") + return scene_from_url(search_results[0].get("url")) else: - log.warning(f"No search results found for name: {name}") + log.warning(f"No search results found for title: {title}") return None log.error(f"No valid fragment provided in arguments: {args}") @@ -199,24 +199,25 @@ def gallery_from_fragment(args: dict[str, Any]) -> ScrapedGallery: return gallery_from_url(url) # if name is provided, call scene_search and convert first result to gallery - if name := args.get("name"): - log.debug(f"Searching for gallery by name fragment: {name}") - if search_results := scene_search(name): - log.debug(f"Found {len(search_results)} search results for name: {name}") - first_result = search_results[0] + if title := args.get("title"): + log.debug(f"Searching for gallery by title fragment: {title}") + args["name"] = title + # reuse scene_from_fragment to find the scene and then convert it to a gallery since the gallery is on the scene page + if scene := scene_from_fragment(args): + log.debug(f"Found scene for title: {title}") gallery = ScrapedGallery( - title=first_result.get("title", ""), - details=first_result.get("details"), - date=first_result.get("date"), - tags=first_result.get("tags", []), - performers=first_result.get("performers", []), - studio=first_result.get("studio"), - url=first_result.get("url", ""), - code=first_result.get("code", ""), + title=scene.get("title", ""), + details=scene.get("details"), + date=scene.get("date"), + tags=scene.get("tags", []), + performers=scene.get("performers", []), + studio=scene.get("studio"), + url=scene.get("url", ""), + code=scene.get("code", ""), ) return gallery else: - log.warning(f"No search results found for name: {name}") + log.warning(f"No scene found for title: {title}") return None log.error(f"No valid fragment provided for gallery extraction in arguments: {args}") From 0d825d10299ff0fe6cd7d0beeea42903d1bfab5c Mon Sep 17 00:00:00 2001 From: nrg101 Date: Thu, 30 Apr 2026 16:29:51 +0100 Subject: [PATCH 10/10] add newline at EOF --- scrapers/DreamTranny/DreamTranny.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scrapers/DreamTranny/DreamTranny.py b/scrapers/DreamTranny/DreamTranny.py index 52caac59e..067acccb0 100644 --- a/scrapers/DreamTranny/DreamTranny.py +++ b/scrapers/DreamTranny/DreamTranny.py @@ -336,4 +336,4 @@ def performer_from_fragment(args: dict[str, Any]) -> list[ScrapedPerformer]: log.error(f"Operation: {op}, arguments: {json.dumps(args)}") sys.exit(1) - print(json.dumps(result)) \ No newline at end of file + print(json.dumps(result))