diff --git a/pkg/api/trailers.go b/pkg/api/trailers.go
index bef45fb06..17f7235ab 100644
--- a/pkg/api/trailers.go
+++ b/pkg/api/trailers.go
@@ -156,7 +156,8 @@ func extractFromJson(inputJson string, params models.TrailerScrape, srcs []model
 	if params.RecordPath != "" {
 		u := gjson.Get(JsonMetadata, params.RecordPath)
 		u.ForEach(func(key, value gjson.Result) bool {
-			url := gjson.Get(value.String(), params.ContentPath).String()
+			url := params.ContentBaseUrl
+			url += gjson.Get(value.String(), params.ContentPath).String()
 			quality := gjson.Get(value.String(), params.QualityPath).String()
 			encoding := ""
 			if params.EncodingPath != "" {
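
Context for the trailers.go change: `extractFromJson` now prefixes every extracted URL with `params.ContentBaseUrl`, which matters for VRHush because its JSON stores protocol-relative URLs. Below is a minimal standalone sketch of the same gjson walk; the payload and field names are illustrative stand-ins, not the real site response:

```go
package main

import (
	"fmt"

	"github.com/tidwall/gjson"
)

func main() {
	// Illustrative stand-in for the JSON a scene page embeds; the paths
	// mirror the RecordPath/ContentPath/QualityPath config set elsewhere
	// in this patch.
	payload := `{"props":{"pageProps":{"content":{"trailers":[
		{"url":"//cdn.example.com/trailer_2048.mp4","label":"2048p"},
		{"url":"//cdn.example.com/trailer_1080.mp4","label":"1080p"}]}}}}`

	base := "https:" // plays the role of params.ContentBaseUrl
	records := gjson.Get(payload, "props.pageProps.content.trailers")
	records.ForEach(func(key, value gjson.Result) bool {
		url := base + gjson.Get(value.String(), "url").String()
		quality := gjson.Get(value.String(), "label").String()
		fmt.Println(quality, url)
		return true // continue iterating
	})
}
```
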
diff --git a/pkg/models/model_external_reference.go b/pkg/models/model_external_reference.go
index 46e7e6747..9c779a3c3 100644
--- a/pkg/models/model_external_reference.go
+++ b/pkg/models/model_external_reference.go
@@ -597,20 +597,18 @@ func (scrapeRules ActorScraperConfig) buildGenericActorScraperRules() {
 
 	siteDetails = GenericScraperRuleSet{}
 	siteDetails.Domain = "vrhush.com"
-	siteDetails.SiteRules = append(siteDetails.SiteRules, GenericActorScraperRule{XbvrField: "image_url", Selector: `img[id="model-thumbnail"]`, ResultType: "attr", Attribute: "src", PostProcessing: []PostProcessing{{Function: "AbsoluteUrl"}}})
-	siteDetails.SiteRules = append(siteDetails.SiteRules, GenericActorScraperRule{XbvrField: "biography", Selector: `div[id="model-info-block"] p`})
-	siteDetails.SiteRules = append(siteDetails.SiteRules, GenericActorScraperRule{XbvrField: "ethnicity", Selector: `ul.model-attributes li:contains("Ethnicity")`, PostProcessing: []PostProcessing{{Function: "RegexString", Params: []string{`Ethnicity (.*)`, "1"}}}})
-	siteDetails.SiteRules = append(siteDetails.SiteRules, GenericActorScraperRule{XbvrField: "eye_color", Selector: `ul.model-attributes li:contains("Eye Color")`, PostProcessing: []PostProcessing{{Function: "RegexString", Params: []string{`Eye Color (.*)`, "1"}}}})
-	siteDetails.SiteRules = append(siteDetails.SiteRules, GenericActorScraperRule{
-		XbvrField: "height", Selector: `ul.model-attributes li:contains("Height")`,
-		PostProcessing: []PostProcessing{{Function: "RegexString", Params: []string{`^(Height )(.+)`, "2"}}, {Function: "Feet+Inches to cm", Params: []string{`(\d+)\'(\d+)\"`, "1", "2"}}},
-	})
-	siteDetails.SiteRules = append(siteDetails.SiteRules, GenericActorScraperRule{XbvrField: "gender", Selector: `ul.model-attributes li:contains("Gender")`, PostProcessing: []PostProcessing{{Function: "RegexString", Params: []string{`Gender (.*)`, "1"}}}})
-	siteDetails.SiteRules = append(siteDetails.SiteRules, GenericActorScraperRule{XbvrField: "hair_color", Selector: `ul.model-attributes li:contains("Hair Color")`, PostProcessing: []PostProcessing{{Function: "RegexString", Params: []string{`Hair Color (.*)`, "1"}}}})
-	siteDetails.SiteRules = append(siteDetails.SiteRules, GenericActorScraperRule{
-		XbvrField: "weight", Selector: `ul.model-attributes li:contains("Weight")`,
-		PostProcessing: []PostProcessing{{Function: "RegexString", Params: []string{`^(Weight )(.+)`, "2"}}, {Function: "lbs to kg"}},
-	})
+	siteDetails.SiteRules = append(siteDetails.SiteRules, GenericActorScraperRule{XbvrField: "gender", Selector: `script[id="__NEXT_DATA__"]`, PostProcessing: []PostProcessing{{Function: "jsonString", Params: []string{"props.pageProps.model.gender"}}}})
+	siteDetails.SiteRules = append(siteDetails.SiteRules, GenericActorScraperRule{XbvrField: "image_url", Selector: `.thumbnail img`, ResultType: "attr", Attribute: "src", PostProcessing: []PostProcessing{{Function: "AbsoluteUrl"}}})
+	siteDetails.SiteRules = append(siteDetails.SiteRules, GenericActorScraperRule{XbvrField: "biography", Selector: `script[id="__NEXT_DATA__"]`, PostProcessing: []PostProcessing{{Function: "jsonString", Params: []string{"props.pageProps.model.Bio"}}}})
+	siteDetails.SiteRules = append(siteDetails.SiteRules, GenericActorScraperRule{XbvrField: "eye_color", Selector: `script[id="__NEXT_DATA__"]`, PostProcessing: []PostProcessing{{Function: "jsonString", Params: []string{"props.pageProps.model.eyes"}}}})
+	siteDetails.SiteRules = append(siteDetails.SiteRules, GenericActorScraperRule{XbvrField: "hair_color", Selector: `script[id="__NEXT_DATA__"]`, PostProcessing: []PostProcessing{{Function: "jsonString", Params: []string{"props.pageProps.model.hair"}}}})
+	siteDetails.SiteRules = append(siteDetails.SiteRules, GenericActorScraperRule{XbvrField: "ethnicity", Selector: `script[id="__NEXT_DATA__"]`, PostProcessing: []PostProcessing{{Function: "jsonString", Params: []string{"props.pageProps.model.race"}}}})
+	siteDetails.SiteRules = append(siteDetails.SiteRules, GenericActorScraperRule{XbvrField: "height", Selector: `script[id="__NEXT_DATA__"]`, PostProcessing: []PostProcessing{
+		{Function: "jsonString", Params: []string{"props.pageProps.model.height"}},
+		{Function: "Feet+Inches to cm", Params: []string{`(\d+)\'(\d+)\"`, "1", "2"}}}})
+	siteDetails.SiteRules = append(siteDetails.SiteRules, GenericActorScraperRule{XbvrField: "weight", Selector: `script[id="__NEXT_DATA__"]`, PostProcessing: []PostProcessing{
+		{Function: "jsonString", Params: []string{"props.pageProps.model.weight"}},
+		{Function: "lbs to kg"}}})
 	scrapeRules.GenericActorScrapingConfig["vrhush scrape"] = siteDetails
 
 	siteDetails.Domain = "vrallure.com"
@@ -1177,6 +1175,13 @@ func (scrapeRules ActorScraperConfig) getSiteUrlMatchingRules() {
 		StashId: "2059fbf9-94fe-4986-8565-2a7cc199636a",
 		Rules: []SceneMatchRule{{XbvrField: "scene_url", XbvrMatch: `(realjamvr.com)(.*)\/(\d*-?)([^\/]+)\/?$`, XbvrMatchResultPosition: 4, StashRule: `(realjamvr.com)(.*)\/(\d*-?)([^\/]+)\/?$`, StashMatchResultPosition: 4}},
 	}
+	scrapeRules.StashSceneMatching["vrhush"] = StashSiteConfig{
+		StashId: "c85a3d13-c1b9-48d0-986e-3bfceaf0afe5",
+		// ignores optional /vrh999_ from old urls
+		Rules: []SceneMatchRule{{XbvrField: "scene_url", XbvrMatch: `\/([^\/]+)$`, XbvrMatchResultPosition: 1, StashRule: `\/((vrh\d+)_)?([^\/?]+)(?:\?.*)?$`, StashMatchResultPosition: 3}, // handle trailing query params
+			{XbvrField: "scene_url", XbvrMatch: `\/([^\/]+)$`, XbvrMatchResultPosition: 1, StashRule: `\/((vrh\d+)_)?([^\/?]+)(?:_180.*)?$`, StashMatchResultPosition: 3}, // handle _180 suffix now gone from urls
+		},
+	}
 	scrapeRules.StashSceneMatching["sexbabesvr"] = StashSiteConfig{
 		StashId: "b80d419c-4a81-44c9-ae79-d9614dd30351",
 		Rules: []SceneMatchRule{{XbvrField: "scene_url", XbvrMatch: `(sexbabesvr.com)(.*)\/([^\/]+)\/?$`, XbvrMatchResultPosition: 3, StashRule: `(sexbabesvr.com)(.*)\/([^\/]+)\/?$`, StashMatchResultPosition: 3}},
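
The two vrhush `SceneMatchRule`s normalize both sides down to the bare scene slug: the Stash-side pattern drops an optional `vrh<digits>_` prefix plus either a trailing query string or a legacy `_180...` suffix, and capture group 3 carries the slug (hence `StashMatchResultPosition: 3`). A quick sketch of that behavior on made-up old-style and new-style URLs:

```go
package main

import (
	"fmt"
	"regexp"
)

func main() {
	// StashRule from the vrhush entry above: optional vrh<digits>_ prefix,
	// slug in group 3, optional trailing query string.
	re := regexp.MustCompile(`\/((vrh\d+)_)?([^\/?]+)(?:\?.*)?$`)

	for _, u := range []string{
		"https://vrhush.com/scenes/vrh01234_some-scene-slug?utm=x", // old-style URL (made up)
		"https://vrhush.com/scenes/some-scene-slug",                // new-style URL (made up)
	} {
		if m := re.FindStringSubmatch(u); m != nil {
			fmt.Println(m[3]) // group 3 = slug, per StashMatchResultPosition: 3
		}
	}
}
```

Both inputs print `some-scene-slug`, so old and new URL shapes match the same Stash scene.
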
diff --git a/pkg/scrape/genericactorscraper.go b/pkg/scrape/genericactorscraper.go
index fcd88d4f1..4b723ff3e 100644
--- a/pkg/scrape/genericactorscraper.go
+++ b/pkg/scrape/genericactorscraper.go
@@ -81,7 +81,7 @@ func GenericActorScrapers() {
 	db.Raw(sqlcmd).Scan(&output)
 
 	var wg sync.WaitGroup
-	concurrentLimit := 20 // Maximum number of concurrent tasks
+	concurrentLimit := 10 // Maximum number of concurrent tasks
 	semaphore = make(chan struct{}, concurrentLimit)
 	actorSemMap := make(map[uint]chan struct{})
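
The `concurrentLimit` reduction from 20 to 10 feeds the buffered-channel semaphore right below it. A self-contained sketch of that pattern (not the actual `GenericActorScrapers` body) showing how the buffer size caps in-flight goroutines:

```go
package main

import (
	"fmt"
	"sync"
)

func main() {
	const concurrentLimit = 10 // mirrors the new limit above
	semaphore := make(chan struct{}, concurrentLimit)

	var wg sync.WaitGroup
	for i := 0; i < 50; i++ {
		wg.Add(1)
		semaphore <- struct{}{} // blocks once 10 workers are in flight
		go func(id int) {
			defer wg.Done()
			defer func() { <-semaphore }() // release the slot
			fmt.Println("scraping actor", id)
		}(i)
	}
	wg.Wait()
}
```
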
diff --git a/pkg/scrape/vrhush.go b/pkg/scrape/vrhush.go
index c79441e5d..a082be3ec 100644
--- a/pkg/scrape/vrhush.go
+++ b/pkg/scrape/vrhush.go
@@ -2,8 +2,8 @@ package scrape
 
 import (
 	"encoding/json"
-	"net/url"
-	"regexp"
+	"fmt"
+	"strconv"
 	"strings"
 	"sync"
 
@@ -22,8 +22,7 @@ func VRHush(wg *sync.WaitGroup, updateSite bool, knownScenes []string, out chan<
 
 	sceneCollector := createCollector("vrhush.com")
 	siteCollector := createCollector("vrhush.com")
-	castCollector := createCollector("vrhush.com")
-	castCollector.AllowURLRevisit = true
+	pageCnt := 1
 
 	sceneCollector.OnHTML(`html`, func(e *colly.HTMLElement) {
 		sc := models.ScrapedScene{}
@@ -34,108 +33,85 @@ func VRHush(wg *sync.WaitGroup, updateSite bool, knownScenes []string, out chan<
 		sc.HomepageURL = strings.Split(e.Request.URL.String(), "?")[0]
 		sc.MembersUrl = strings.Replace(sc.HomepageURL, "https://vrhush.com/scenes/", "https://ma.vrhush.com/scene/", 1)
 
-		// Scene ID - get from URL
-		tmp := strings.Split(sc.HomepageURL, "/")
-		tmp2 := strings.Split(tmp[len(tmp)-1], "_")[0]
-		sc.SiteID = strings.Replace(tmp2, "vrh", "", -1)
-		sc.SceneID = slugify.Slugify(sc.Site) + "-" + sc.SiteID
+		// get json data
+		var jsonResult map[string]interface{}
+		e.ForEach(`script[id="__NEXT_DATA__"]`, func(id int, e *colly.HTMLElement) {
+			json.Unmarshal([]byte(e.Text), &jsonResult)
+		})
+		jsonResult = jsonResult["props"].(map[string]interface{})
+		jsonResult = jsonResult["pageProps"].(map[string]interface{})
+		content := jsonResult["content"].(map[string]interface{})
 
-		// Regex for original resolution of gallery
-		reGetOriginal := regexp.MustCompile(`^(https?:\/\/b8h6h9v9\.ssl\.hwcdn\.net\/vrh\/)(?:largethumbs|hugethumbs|rollover_large|rollover_huge)(\/.+)-c\d{3,4}x\d{3,4}(\.\w{3,4})$`)
+		// Scene ID - get from json scene code (url no longer has the code)
+		tmp := strings.Split(content["scene_code"].(string), "_")[0]
+		sc.SiteID = strings.Replace(tmp, "vrh", "", -1)
+		sc.SceneID = slugify.Slugify(sc.Site) + "-" + sc.SiteID
 
 		// Title / Cover
-		e.ForEach(`.latest-scene-title`, func(id int, e *colly.HTMLElement) {
-			sc.Title = strings.TrimSpace(e.Text)
-		})
-		e.ForEach(`web-vr-video-player`, func(id int, e *colly.HTMLElement) {
-			sc.Covers = append(sc.Covers, e.Request.AbsoluteURL(e.Attr("coverimage")))
-		})
-
-		// Gallery
-		// note 'rollover_large' could be changed to 'rollover_huge' for HQ original but those are easily 5Mb+
-		e.ForEach(`div.owl-carousel img.img-responsive`, func(id int, e *colly.HTMLElement) {
-			tmpParts := reGetOriginal.FindStringSubmatch(e.Request.AbsoluteURL(e.Attr("src")))
-			if len(tmpParts) > 3 {
-				sc.Gallery = append(sc.Gallery, tmpParts[1]+"rollover_large"+tmpParts[2]+tmpParts[3])
-			}
-		})
+		sc.Title = content["title"].(string)
+		sc.Covers = append(sc.Covers, e.Request.AbsoluteURL(content["trailer_screencap"].(string)))
 
 		// Synopsis
-		e.ForEach(`span.full-description`, func(id int, e *colly.HTMLElement) {
-			sc.Synopsis = strings.TrimSpace(e.Text)
-		})
+		sc.Synopsis = content["description"].(string)
 
 		// Tags
-		e.ForEach(`p.tag-container a.label-tag`, func(id int, e *colly.HTMLElement) {
-			sc.Tags = append(sc.Tags, strings.TrimSpace(e.Text))
-		})
+		tagList := content["tags"].([]interface{})
+		for _, tag := range tagList {
+			sc.Tags = append(sc.Tags, tag.(string))
+		}
 
 		// Cast
 		sc.ActorDetails = make(map[string]models.ActorDetails)
-		var tmpCast []string
-		e.ForEach(`h5.latest-scene-subtitle a`, func(id int, e *colly.HTMLElement) {
-			tmpCast = append(tmpCast, e.Attr("href"))
-			sc.ActorDetails[e.Text] = models.ActorDetails{Source: sc.ScraperID + " scrape", ProfileUrl: e.Attr("href")}
-		})
-
-		// Date
-		e.ForEach(`div.latest-scene-meta-1 div.text-left`, func(id int, e *colly.HTMLElement) {
-			tmpDate, _ := goment.New(e.Text, "MMM DD, YYYY")
-			sc.Released = tmpDate.Format("YYYY-MM-DD")
-		})
-
-		// Duration
-		sc.Duration = 0
+		modelList := jsonResult["models"].([]interface{})
+		for _, model := range modelList {
+			modelMap, _ := model.(map[string]interface{})
+			if modelMap["gender"] == "Female" {
+				sc.Cast = append(sc.Cast, modelMap["name"].(string))
+				sc.ActorDetails[modelMap["name"].(string)] = models.ActorDetails{Source: sc.ScraperID + " scrape", ProfileUrl: "https://vrhush.com/models/" + modelMap["slug"].(string)}
+			}
+		}
+		// Date & duration
+		tmpDate, _ := goment.New(content["publish_date"].(string), "YYYY/MM/DD")
+		sc.Released = tmpDate.Format("YYYY-MM-DD")
+		durStr := content["videos_duration"].(string)
+		if durStr != "" {
+			num, _ := strconv.ParseFloat(durStr, 64)
+			sc.Duration = int(num / 60)
+		}
 
 		// trailer details
-		sc.TrailerType = "scrape_html"
-		params := models.TrailerScrape{SceneUrl: sc.HomepageURL, HtmlElement: "web-vr-video-player source", ContentPath: "src", QualityPath: "quality", ContentBaseUrl: "https:"}
-		strParams, _ := json.Marshal(params)
-		sc.TrailerSrc = string(strParams)
-
-		// Filenames
-		e.ForEach(`input.stream-input-box`, func(id int, e *colly.HTMLElement) {
-			origURL, _ := url.Parse(e.Attr("value"))
-			sc.Filenames = append(sc.Filenames, origURL.Query().Get("name"))
-		})
-		ctx := colly.NewContext()
-		ctx.Put("scene", &sc)
+		sc.TrailerType = "scrape_json"
+		var t models.TrailerScrape
+		t.SceneUrl = sc.HomepageURL
+		t.HtmlElement = `script[id="__NEXT_DATA__"]`
+		t.RecordPath = "props.pageProps.content.trailers"
+		t.ContentPath = "url"
+		t.QualityPath = "label"
+		t.ContentBaseUrl = "https:"
+		tmpjson, _ := json.Marshal(t)
+		sc.TrailerSrc = string(tmpjson)
 
-		for i := range tmpCast {
-			castCollector.Request("GET", tmpCast[i], nil, ctx, nil)
+		// Filenames
+		videoList := content["videos"].(map[string]interface{})
+		for _, video := range videoList {
+			videoMap, _ := video.(map[string]interface{})
+			tmp := strings.Split(videoMap["file"].(string), "/")
+			sc.Filenames = append(sc.Filenames, tmp[len(tmp)-1])
 		}
 
 		out <- sc
 	})
 
-	castCollector.OnHTML(`html`, func(e *colly.HTMLElement) {
-		sc := e.Request.Ctx.GetAny("scene").(*models.ScrapedScene)
-
-		var name string
-		reDoubleWhitespace := regexp.MustCompile(`[\s\p{Zs}]{2,}`)
-		e.ForEach(`h1#model-name`, func(id int, e *colly.HTMLElement) {
-			name = strings.TrimSpace(reDoubleWhitespace.ReplaceAllString(e.Text, " "))
-		})
-
-		var gender string
-		e.ForEach(`ul.model-attributes li`, func(id int, e *colly.HTMLElement) {
-			if strings.Split(e.Text, " ")[0] == "Gender" {
-				gender = strings.Split(e.Text, " ")[1]
-			}
-		})
-
-		if gender == "Female" {
-			sc.Cast = append(sc.Cast, name)
+	siteCollector.OnHTML(`ul.pagination li`, func(e *colly.HTMLElement) {
+		if strings.Contains(e.Attr("class"), "next") && !strings.Contains(e.Attr("class"), "disabled") {
+			pageCnt += 1
+			pageURL := e.Request.AbsoluteURL(`https://vrhush.com/scenes?page=` + fmt.Sprint(pageCnt) + `&order_by=publish_date&sort_by=desc`)
+			siteCollector.Visit(pageURL)
 		}
 	})
 
-	siteCollector.OnHTML(`ul.pagination a`, func(e *colly.HTMLElement) {
-		pageURL := e.Request.AbsoluteURL(e.Attr("href"))
-		siteCollector.Visit(pageURL)
-	})
-
-	siteCollector.OnHTML(`div.row div.col-md-4 p.desc a`, func(e *colly.HTMLElement) {
+	siteCollector.OnHTML(`div.contentThumb__info__title a`, func(e *colly.HTMLElement) {
 		sceneURL := e.Request.AbsoluteURL(e.Attr("href"))
 
 		// If scene exist in database, there's no need to scrape
@@ -147,7 +123,7 @@ func VRHush(wg *sync.WaitGroup, updateSite bool, knownScenes []string, out chan<
 	if singleSceneURL != "" {
 		sceneCollector.Visit(singleSceneURL)
 	} else {
-		siteCollector.Visit("https://vrhush.com/scenes")
+		siteCollector.Visit("https://vrhush.com/scenes?page=1&order_by=publish_date&sort_by=desc")
 	}
 
 	if updateSite {
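
The rewritten scraper reads scene metadata from the page's embedded Next.js state instead of rendered markup. A stripped-down sketch of the same `__NEXT_DATA__` decoding with a made-up payload; note that the chained type assertions used here and in the scraper will panic if the page shape ever changes, so the comma-ok form may be worth adopting:

```go
package main

import (
	"encoding/json"
	"fmt"
)

func main() {
	// Made-up stand-in for the body of <script id="__NEXT_DATA__">.
	raw := `{"props":{"pageProps":{"content":{"title":"Example Scene","scene_code":"vrh01234_example"}}}}`

	var jsonResult map[string]interface{}
	if err := json.Unmarshal([]byte(raw), &jsonResult); err != nil {
		panic(err)
	}

	// Same walk as the scraper: props -> pageProps -> content.
	props := jsonResult["props"].(map[string]interface{})
	pageProps := props["pageProps"].(map[string]interface{})
	content := pageProps["content"].(map[string]interface{})

	// scene_code carries the site ID now that the URL no longer does.
	fmt.Println(content["title"], content["scene_code"])
}
```
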