diff --git a/scrapers/JavGuru.yml b/scrapers/JavGuru.yml
new file mode 100644
index 000000000..4147116f2
--- /dev/null
+++ b/scrapers/JavGuru.yml
@@ -0,0 +1,116 @@
+# JavGuru
+# Scene by URL / search, Performer (actress) by URL.
+# Scene pages:   https://jav.guru/[id]/[slug]/ (bracketed names are illustrative)
+# Actress pages: https://jav.guru/actress/[name]/
+name: JavGuru
+
+# ---- Scene -------------------------------------------------------------
+sceneByURL:
+  - action: scrapeXPath
+    url:
+      - jav.guru/
+    scraper: sceneScraper
+
+# "Name" search box on the Scene edit page
+sceneByName:
+  action: scrapeXPath
+  queryURL: https://jav.guru/?s={}
+  scraper: sceneSearch
+
+# Scrapes the full scene once a search result is chosen
+sceneByQueryFragment:
+  action: scrapeXPath
+  queryURL: "{url}"
+  scraper: sceneScraper
+
+# ---- Performer (actress) ----------------------------------------------
+performerByURL:
+  - action: scrapeXPath
+    url:
+      - jav.guru/actress/
+    scraper: performerScraper
+
+xPathScrapers:
+  # Full scene page; $info is the metadata list shared by most fields below
+  sceneScraper:
+    common:
+      $info: //div[@class="infoleft"]
+    scene:
+      Title:
+        selector: //h1[@class="titl"]/text()
+        postProcess:
+          # drop the leading "[CODE] " prefix; Code is captured separately
+          - replace:
+              - regex: ^\s*\[[^\]]*\]\s*
+                with: ""
+      Details:
+        # jav.guru scenes have no synopsis; mirror the cleaned title
+        selector: //h1[@class="titl"]/text()
+        postProcess:
+          - replace:
+              - regex: ^\s*\[[^\]]*\]\s*
+                with: ""
+      URL:
+        fixed: "{inputURL}"
+      Code: $info//li[strong/span[contains(text(),"Code")]]/text()
+      Date:
+        selector: $info//li[strong/span[contains(text(),"Release Date")]]/text()
+        postProcess:
+          - parseDate: "2006-01-02"  # date layout; quoted so YAML never types it as a timestamp
+      Image:
+        selector: //div[@class="large-screenimg"]/img/@src
+        postProcess:
+          # cdn.javmiku.com is Cloudflare-challenged for server-side fetches;
+          # cdn.javnorth.com mirrors the same paths and is reachable.
+          - replace:
+              - regex: cdn\.javmiku\.com
+                with: cdn.javnorth.com
+      Director: $info//li[strong[contains(text(),"Director")]]/a
+      Studio:
+        Name: $info//li[strong[contains(text(),"Studio")]]/a
+        URL: $info//li[strong[contains(text(),"Studio")]]/a/@href
+      Tags:
+        Name: $info//li[strong[contains(text(),"Tags")]]/a
+      Performers:
+        Name: $info//li[strong[contains(text(),"Actress")]]/a
+        URLs: $info//li[strong[contains(text(),"Actress")]]/a/@href
+
+  # Search results page (one or more cards)
+  sceneSearch:
+    common:
+      $item: //div[@class="inside-article"][div[@class="imgg"]]
+    scene:
+      Title:
+        selector: $item//div[@class="grid1"]/h2/a/text()
+        postProcess:
+          - replace:
+              - regex: ^\s*\[[^\]]*\]\s*
+                with: ""
+      URL: $item//div[@class="imgg"]/a/@href
+      Image:
+        selector: $item//div[@class="imgg"]/a/img/@src
+        postProcess:
+          - replace:
+              - regex: cdn\.javmiku\.com
+                with: cdn.javnorth.com
+
+  performerScraper:
+    performer:
+      Name: //h1[@class="profile-h1"]/text()
+      Image:
+        selector: //img[@class="cp-avatar"]/@src
+        postProcess:
+          - replace:
+              - regex: cdn\.javmiku\.com
+                with: cdn.javnorth.com
+      URLs:
+        fixed: "{inputURL}"
+      Country:
+        fixed: Japan
+
+# jav.guru rejects requests without a normal browser UA
+driver:
+  headers:
+    - Key: User-Agent
+      Value: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0 Safari/537.36
+# Last Updated June 14, 2026