diff --git a/configs/gleaner_opencore b/configs/gleaner_opencore new file mode 100644 index 00000000..a4ffbe1e --- /dev/null +++ b/configs/gleaner_opencore @@ -0,0 +1,45 @@ +context: + cache: true +contextmaps: + - file: ./assets/schemaorg-current-https.jsonld + prefix: https://schema.org/ + - file: ./assets/schemaorg-current-http.jsonld + prefix: http://schema.org/ +gleaner: + mill: true + runid: runX + summon: true +millers: + graph: true +minio: + address: oss.geocodes-dev.earthcube.org + port: 443 + ssl: true + accesskey: worldsbestaccesskey + secretkey: worldsbestsecretkey + bucket: opencore +sources: + - sourcetype: sitemap + name: opencoredata + logo: https://opencoredata.org/img/logo22small.png + url: http://opencoredata.org/sitemap.xml + headless: false + pid: https://www.re3data.org/repository/r3d100012874 + propername: opencoredata + domain: https://opencoredata.org/ + active: true + credentialsfile: "" + other: {} + headlesswait: 0 + delay: 1 + identifierpath: "" + apipagelimit: 0 + identifiertype: identifiersha + fixcontextoption: 0 + acceptcontenttype: "application/ld+json" +summoner: + after: "" + delay: null + headless: http://127.0.0.1:9222 + mode: full + threads: 15 diff --git a/internal/summoner/acquire/acquire.go b/internal/summoner/acquire/acquire.go index 5f2c2d75..f5d019d8 100644 --- a/internal/summoner/acquire/acquire.go +++ b/internal/summoner/acquire/acquire.go @@ -51,17 +51,17 @@ func ResRetrieve(v1 *viper.Viper, mc *minio.Client, m map[string][]string, runSt wg.Wait() } -func getConfig(v1 *viper.Viper, sourceName string) (string, int, int64, int, error) { +func getConfig(v1 *viper.Viper, sourceName string) (string, int, int64, int, string, string, error) { bucketName, err := configTypes.GetBucketName(v1) if err != nil { - return bucketName, 0, 0, 0, err + return bucketName, 0, 0, 0, configTypes.AccceptContentType, "", err } var mcfg configTypes.Summoner mcfg, err = configTypes.ReadSummmonerConfig(v1.Sub("summoner")) if err != nil { - 
return bucketName, 0, 0, 0, err + return bucketName, 0, 0, 0, configTypes.AccceptContentType, "", err } // Set default thread counts and global delay tc := mcfg.Threads @@ -74,9 +74,11 @@ func getConfig(v1 *viper.Viper, sourceName string) (string, int, int64, int, err // look for a domain specific override crawl delay sources, err := configTypes.GetSources(v1) source, err := configTypes.GetSourceByName(sources, sourceName) + acceptContent := source.AcceptContentType + jsonProfile := source.JsonProfile hw := source.HeadlessWait if err != nil { - return bucketName, tc, delay, hw, err + return bucketName, tc, delay, hw, acceptContent, jsonProfile, err } if source.Delay != 0 && source.Delay > delay { @@ -85,13 +87,14 @@ func getConfig(v1 *viper.Viper, sourceName string) (string, int, int64, int, err log.Info("Crawl delay set to ", delay, " for ", sourceName) } log.Info("Thread count ", tc, " delay ", delay) - return bucketName, tc, delay, hw, nil + + return bucketName, tc, delay, hw, acceptContent, jsonProfile, nil } func getDomain(v1 *viper.Viper, mc *minio.Client, urls []string, sourceName string, wg *sync.WaitGroup, repologger *log.Logger, repoStats *common.RepoStats) { - bucketName, tc, delay, headlessWait, err := getConfig(v1, sourceName) + bucketName, tc, delay, headlessWait, acceptContent, jsonProfile, err := getConfig(v1, sourceName) if err != nil { // trying to read a source, so let's not kill everything with a panic/fatal log.Error("Error reading config file ", err) @@ -133,7 +136,7 @@ func getDomain(v1 *viper.Viper, mc *minio.Client, urls []string, sourceName stri log.Error(i, err, urlloc) } req.Header.Set("User-Agent", EarthCubeAgent) - req.Header.Set("Accept", "application/ld+json, text/html") + req.Header.Set("Accept", acceptContent) resp, err := client.Do(req) if err != nil { @@ -145,8 +148,31 @@ func getDomain(v1 *viper.Viper, mc *minio.Client, urls []string, sourceName stri } defer resp.Body.Close() - jsonlds, err := FindJSONInResponse(v1, urlloc, 
repologger, resp) - + jsonlds, err := FindJSONInResponse(v1, urlloc, jsonProfile, repologger, resp) + // there was an issue with sitemaps... but now this code + //if contains(contentTypeHeader, JSONContentType) || contains(contentTypeHeader, "application/json") { + // + // b, err := io.ReadAll(resp.Body) + // // b, err := ioutil.ReadAll(resp.Body) Go.1.15 and earlier + // if err != nil { + // log.Error("#", i, " error on ", urlloc, err) // print an message containing the index (won't keep order) + // repoStats.Inc(common.Issues) + // lwg.Done() // tell the wait group that we be done + // <-semaphoreChan + // return + // } + // jsonlds = []string{string(b)} + //} else { + // var err error + // jsonlds, err = FindJSONInResponse(v1, urlloc, jsonProfile, repologger, resp) + // if err != nil { + // log.Error("#", i, " error on ", urlloc, err) // print an message containing the index (won't keep order) + // repoStats.Inc(common.Issues) + // lwg.Done() // tell the wait group that we be done + // <-semaphoreChan + // return + // } + //} if err != nil { log.Error("#", i, " error on ", urlloc, err) // print an message containing the index (won't keep order) repoStats.Inc(common.Issues) @@ -194,7 +220,7 @@ func getDomain(v1 *viper.Viper, mc *minio.Client, urls []string, sourceName stri common.RunRepoStatsOutput(repoStats, sourceName) } -func FindJSONInResponse(v1 *viper.Viper, urlloc string, repologger *log.Logger, response *http.Response) ([]string, error) { +func FindJSONInResponse(v1 *viper.Viper, urlloc string, jsonProfile string, repologger *log.Logger, response *http.Response) ([]string, error) { doc, err := goquery.NewDocumentFromResponse(response) if err != nil { return nil, err @@ -206,19 +232,23 @@ func FindJSONInResponse(v1 *viper.Viper, urlloc string, repologger *log.Logger, // if the URL is sending back JSON-LD correctly as application/ld+json // this should not be here IMHO, but need to support people not setting proper header value // The URL is sending back 
JSON-LD but incorrectly sending as application/json + // would like to also test contains(contentTypeHeader, jsonProfile) here, + // but an empty jsonProfile string would match every content type if contains(contentTypeHeader, JSONContentType) || contains(contentTypeHeader, "application/json") || fileExtensionIsJson(urlloc) { logFields := log.Fields{"url": urlloc, "contentType": "json or ld_json"} repologger.WithFields(logFields).Debug() log.WithFields(logFields).Debug(urlloc, " as ", contentTypeHeader) - - jsonlds, err = addToJsonListIfValid(v1, jsonlds, doc.Text()) + resp_text := doc.Text() + jsonlds, err = addToJsonListIfValid(v1, jsonlds, resp_text) if err != nil { log.WithFields(logFields).Error("Error processing json response from ", urlloc, err) repologger.WithFields(logFields).Error(err) } - // look in the HTML response for