diff --git a/cmd/husker/main.go b/cmd/husker/main.go index 10bed365..4e408419 100644 --- a/cmd/husker/main.go +++ b/cmd/husker/main.go @@ -62,7 +62,7 @@ func main() { runStats := common.NewRunStats() repostats := runStats.Add(k) - err = acquire.PageRenderAndUpload(v1, mc, 45*time.Second, url, k, rlogginer, repostats) + err = acquire.PageRenderAndUpload(v1, mc, 45*time.Second, url, k, rlogginer, repostats, nil, "") if err != nil { panic(fmt.Errorf("error when reading config: %v", err)) } diff --git a/configs/test_sources/gleaner b/configs/test_sources/gleaner new file mode 100644 index 00000000..adf216a6 --- /dev/null +++ b/configs/test_sources/gleaner @@ -0,0 +1,66 @@ +context: + cache: true +contextmaps: + - file: ./configs/schemaorg-current-https.jsonld + prefix: https://schema.org/ + - file: ./configs/schemaorg-current-https.jsonld + prefix: http://schema.org/ +gleaner: + mill: true + runid: runX + summon: true +millers: + graph: true +minio: + address: oss.geocodes-aws-dev.earthcube.org + port: 443 + ssl: true + bucket: dvtest + region: "" + accesskey: worldsbestaccesskey + secretkey: worldsbestsecretkey +sources: + - sourcetype: sitemap + name: headless + logo: https://opentopography.org/sites/opentopography.org/files/ot_transp_logo_2.png + url: https://earthcube.github.io/GeoCODES-Metadata/site/sitemaps/headless.xml + headless: true + pid: https://www.re3data.org/repository/r3d100010655 + propername: TEST HEADLESS SOURCES + domain: http://wwwearthcube.org/headless/ + active: true + credentialsfile: "" + other: {} + headlesswait: 0 + delay: 0 + identifierpath: ' "$.distribution.contentUrl"' + apipagelimit: 0 + identifiertype: identifiersha + fixcontextoption: 0 + acceptcontenttype: application/ld+json, text/html + jsonprofile: application/ld+json + - sourcetype: sitemap + name: mixed + logo: http://ds.iris.edu/static/img/layout/logos/iris_logo_shadow.png + url: http://ds.iris.edu/files/sitemap.xml + headless: false + pid: https://www.re3data.org/repository/r3d100010268 + propername: TEST MIXED SOURCES + domain: http://wwwearthcube.org/headless/ + active: true + credentialsfile: "" + other: {} + headlesswait: 0 + delay: 0 + identifierpath: "" + apipagelimit: 0 + identifiertype: identifiersha + fixcontextoption: 0 + acceptcontenttype: application/ld+json, text/html + jsonprofile: application/ld+json +summoner: + after: "" + delay: null + headless: http://127.0.0.1:9222 + mode: full + threads: 5 diff --git a/configs/test_sources/gleaner_base.yaml b/configs/test_sources/gleaner_base.yaml new file mode 100644 index 00000000..d0392bdc --- /dev/null +++ b/configs/test_sources/gleaner_base.yaml @@ -0,0 +1,42 @@ +--- +minio: + address: 0.0.0.0 + port: 9000 + accessKey: worldsbestaccesskey + secretKey: worldsbestsecretkey + ssl: false + bucket: gleaner +gleaner: + runid: runX # this will be the bucket the output is placed in... 
+  summon: true # do we want to visit the web sites and pull down the files
+  mill: true
+context:
+  cache: true
+contextmaps:
+- prefix: "https://schema.org/"
+  file: "./configs/schemaorg-current-https.jsonld"
+- prefix: "http://schema.org/"
+  file: "./configs/schemaorg-current-https.jsonld"
+summoner:
+  after: "" # "21 May 20 10:00 UTC"
+  mode: full # full || diff: if diff, compare what we currently have in gleaner to the sitemap; fetch only new entries, delete missing ones
+  threads: 5
+  delay: # milliseconds (1000 = 1 second) to delay between calls (will FORCE threads to 1)
+  headless: http://127.0.0.1:9222 # URL for headless; see docs/headless
+millers:
+  graph: true
+# will be built from sources.csv
+#sitegraphs:
+#- name: aquadocs
+#  url: https://oih.aquadocs.org/aquadocs.json
+#  headless: false
+#  pid: http://hdl.handle.net/1834/41372
+#  properName: AquaDocs
+#  domain: https://aquadocs.org
+#sitemaps:
+#- name: samplesearth
+#  url: https://samples.earth/sitemap.xml
+#  headless: false
+#  pid: https://www.re3data.org/repository/samplesearth
+#  properName: Samples Earth (DEMO Site)
+#  domain: https://samples.earth
diff --git a/configs/test_sources/localConfig.yaml b/configs/test_sources/localConfig.yaml
new file mode 100644
index 00000000..b5067860
--- /dev/null
+++ b/configs/test_sources/localConfig.yaml
@@ -0,0 +1,26 @@
+---
+minio:
+  address: oss.geocodes-aws-dev.earthcube.org
+  port: 443
+  accessKey: worldsbestaccesskey
+  secretKey: worldsbestsecretkey
+  ssl: true
+  bucket: dvtest # can be overridden with MINIO_BUCKET
+sparql:
+  endpoint: https://graph.geocodes-dev.earthcube.org/blazegraph/namespace/earthcube/sparql
+s3:
+  bucket: dvtest # sync with above... can be overridden with MINIO_BUCKET... gets zapped if it's not here.
+  domain: us-east-1
+
+# headless field in gleaner.summoner
+headless: http://127.0.0.1:9222
+sourcesSource:
+  type: csv
+  location: sources.csv
+  #location: https://docs.google.com/spreadsheets/d/e/2PACX-1vTt_45dYd5LMFK9Qm_lCg6P7YxG-ae0GZEtrHMZmNbI-y5tVDd8ZLqnEeIAa-SVTSztejfZeN6xmRZF/pub?gid=1340502269&single=true&output=csv
+# this can be a remote csv
+#  type: csv
+#  location: https://docs.google.com/spreadsheets/d/{key}/gviz/tq?tqx=out:csv&sheet={sheet_name}
+# TBD -- Just use the sources in the gleaner file.
+# type: yaml +# location: gleaner.yaml \ No newline at end of file diff --git a/configs/test_sources/nabu b/configs/test_sources/nabu new file mode 100644 index 00000000..e2b11e0a --- /dev/null +++ b/configs/test_sources/nabu @@ -0,0 +1,23 @@ +minio: + address: oss.geocodes-aws-dev.earthcube.org + port: 443 + ssl: true + bucket: dvtest + region: "" + accesskey: worldsbestaccesskey + secretkey: worldsbestsecretkey +objects: + bucket: dvtest + domain: us-east-1 + prefix: + - summoned/headless + - summoned/mixed + - org + prefixoff: [] +sparql: + endpoint: https://graph.geocodes-dev.earthcube.org/blazegraph/namespace/earthcube/sparql + authenticate: false + username: "" + password: "" +txtaipkg: + endpoint: http://0.0.0.0:8000 diff --git a/configs/test_sources/nabu_base.yaml b/configs/test_sources/nabu_base.yaml new file mode 100644 index 00000000..a5f42ca1 --- /dev/null +++ b/configs/test_sources/nabu_base.yaml @@ -0,0 +1,27 @@ +minio: + accesskey: worldsbestaccesskey + address: 0.0.0.0 + bucket: gleaner + port: 9000 + secretkey: worldsbestsecretkey + ssl: false +objects: + bucket: gleaner + domain: us-east-1 + # prefix will be built using the sources.csv, and additional values + prefix: + - orgs + - summoned/obps + - prov/obps + - summoned/aquadocs + - prov/aquadocs + - milled/marinetraining + - prov/marinetraining + - milled/obis + - prov/obis + - milled/oceanexperts + - prov/oceanexperts +sparql: + endpoint: http://192.168.86.45:32775/blazegraph/namespace/lipd/sparql +txtaipkg: + endpoint: http://0.0.0.0:8000 diff --git a/configs/test_sources/sources.csv b/configs/test_sources/sources.csv new file mode 100644 index 00000000..8a27f21c --- /dev/null +++ b/configs/test_sources/sources.csv @@ -0,0 +1,3 @@ +hack,SourceType,Active,Name,ProperName,URL,Headless,IdentifierType,IdentifierPath,Domain,PID,Logo +3,sitemap,TRUE,headless,TEST HEADLESS SOURCES,https://earthcube.github.io/GeoCODES-Metadata/site/sitemaps/headless.xml,TRUE,identifiersha," ""$.distribution.contentUrl""",http://wwwearthcube.org/headless/,https://www.re3data.org/repository/r3d100010655,https://opentopography.org/sites/opentopography.org/files/ot_transp_logo_2.png +4,sitemap,TRUE,mixed,TEST MIXED SOURCES,http://ds.iris.edu/files/sitemap.xml,FALSE,identifiersha,,http://wwwearthcube.org/headless/,https://www.re3data.org/repository/r3d100010268,http://ds.iris.edu/static/img/layout/logos/iris_logo_shadow.png \ No newline at end of file diff --git a/internal/common/identifier.go b/internal/common/identifier.go index f544a24a..d0f1cbe8 100644 --- a/internal/common/identifier.go +++ b/internal/common/identifier.go @@ -202,7 +202,12 @@ func GenerateNormalizedSha(v1 *viper.Viper, jsonld string) (Identifier, error) { if uuid == "" { // error log.Error("ERROR: uuid generator:", "Action: Getting normalized sha Error:", err) - id = Identifier{} + id, _ = GenerateFileSha(v1, jsonld) + //id = Identifier{ + // UniqueId: uuid, + // IdentifierType: config.FileSha, + // JsonSha: uuid, + //} } else if err != nil { // no error, then normalized triples generated log.Info(" Action: Normalize sha generated sha:", uuid, " Error:", err) @@ -234,13 +239,13 @@ func GenerateFileSha(v1 *viper.Viper, jsonld string) (Identifier, error) { log.Error("ERROR: uuid generator:", "Action: Getting file sha") id = Identifier{} } - log.Debug(" Action: Json sha generated", uuid) + log.Debug(" Action: file sha generated", uuid) id = Identifier{UniqueId: uuid, - IdentifierType: config.JsonSha, + IdentifierType: config.FileSha, JsonSha: uuid, } - log.Trace("jsonsha: ", 
uuid)
+	log.Trace("filesha: ", uuid)
	// fmt.Println("\njsonsha:", id)
	return id, nil
}
diff --git a/internal/common/identifier_test.go b/internal/common/identifier_test.go
index 958ec4b7..bcc0fafb 100644
--- a/internal/common/identifier_test.go
+++ b/internal/common/identifier_test.go
@@ -482,6 +482,77 @@ sources:
 	}
 }
 
+// this always uses configTypes.IdentifierSha.
+func testGenerateIdentifierFallthrough(tests []jsonexpectations, t *testing.T) {
+
+	// mock config file
+	// paths are relative to the code
+	var vipercontext = []byte(`
+context:
+  cache: true
+contextmaps:
+- file: ../../configs/schemaorg-current-https.jsonld
+  prefix: https://schema.org/
+- file: ../../configs/schemaorg-current-https.jsonld
+  prefix: http://schema.org/
+sources:
+- sourcetype: sitemap
+  name: test
+  logo: https://opentopography.org/sites/opentopography.org/files/ot_transp_logo_2.png
+  url: https://opentopography.org/sitemap.xml
+  headless: false
+  pid: https://www.re3data.org/repository/r3d100010655
+  propername: OpenTopography
+  domain: http://www.opentopography.org/
+  active: false
+  credentialsfile: ""
+  other: {}
+  headlesswait: 0
+  delay: 0
+  IdentifierType: identifiersha
+`)
+
+	for _, test := range tests {
+		for i, json := range test.json {
+			// needs to be defined in the loop, so that each run has its own configuration;
+			// otherwise changing the sources information in a multi-threaded environment has issues
+			viperVal := viper.New()
+			viperVal.SetConfigType("yaml")
+			viperVal.ReadConfig(bytes.NewBuffer(vipercontext))
+			sources, err := configTypes.GetSources(viperVal)
+
+			if err != nil {
+				assert.Fail(t, err.Error())
+			}
+
+			s := sources[0]
+			s.IdentifierType = configTypes.IdentifierSha
+			s.IdentifierPath = test.IdentifierPaths
+			t.Run(fmt.Sprint(test.name, "_", i), func(t *testing.T) {
+				if test.ignore {
+					return
+				}
+				path := filepath.Join("testdata", "identifier", json)
+				assert.FileExistsf(t, path, "Datafile Missing: %s", path)
+				source, err := os.ReadFile(path)
+				if err != nil {
+					t.Fatal("error reading source file:", err)
+				}
+				result, err := GenerateIdentifier(viperVal, s, string(source))
+				//valStr := fmt.Sprint(result.uniqueId)
+				assert.Equal(t, test.expected, result.UniqueId, "uuid failed")
+				assert.Equal(t, test.expectedPath, result.MatchedPath, "matched path failed")
+				assert.Equal(t, test.IdentifierType, result.IdentifierType, "identifier type failed")
+				if test.errorExpected {
+					assert.NotNil(t, err)
+				} else {
+					assert.Nil(t, err)
+				}
+
+			})
+		}
+	}
+}
 
 func TestGenerateFileShaIdentifier(t *testing.T) {
 
 	var tests = []jsonexpectations{
@@ -628,3 +699,38 @@ func TestValidJsonPathGraphInput(t *testing.T) {
 
 	testValidJsonPaths(tests, t)
 }
+
+func TestGenerateBadIdentifier(t *testing.T) {
+
+	var tests = []jsonexpectations{
+		// default
+		// should work for all
+
+		{
+			name: "extra_slash_jsonsha",
+			json: map[string]string{
+				"bad_json": "bad_json.jsonld",
+			},
+			errorExpected:   true,
+			IdentifierType:  configTypes.FileSha,
+			IdentifierPaths: "",
+			expected:        "cc8c395af8163203dda2c6bee35fd728071bd809",
+			expectedPath:    "",
+			ignore:          false,
+		},
+		//{
+		//	name: "extra_slash_identifier",
+		//	json: map[string]string{
+		//		"bad_json": "bad_json.jsonld",
+		//	},
+		//	errorExpected: false,
+		//	IdentifierType: configTypes.FileSha,
+		//	IdentifierPaths: "",
+		//	expected: "cc8c395af8163203dda2c6bee35fd728071bd809",
+		//	expectedPath: "",
+		//	ignore: false,
+		//},
+	}
+
+	testGenerateIdentifierFallthrough(tests, t)
+}
diff --git a/internal/common/logger.go b/internal/common/logger.go
index b701ee3a..90fc7cd0
100644 --- a/internal/common/logger.go +++ b/internal/common/logger.go @@ -35,6 +35,7 @@ func InitLogging() { log.SetFormatter(&log.JSONFormatter{}) // Log as JSON instead of the default ASCII formatter. log.SetReportCaller(true) // include file name and line number + log.SetLevel(log.InfoLevel) mw := io.MultiWriter(os.Stdout, logFile) log.SetOutput(mw) //log.SetOutput(logFile) diff --git a/internal/common/stats.go b/internal/common/stats.go index bc237417..d2339329 100644 --- a/internal/common/stats.go +++ b/internal/common/stats.go @@ -65,6 +65,10 @@ const EmptyDoc string = "SitemapEmptyDoc" const Stored string = "SitemapStored" const StoreError string = "SitemapStoredError" const HeadlessError string = "HeadlessServerError" +const NotAuthorized string = "NotAuthorized" +const BadUrl string = "BadURL404" +const RepoServerError string = "RepoServerError" +const GenericIssue = "GenericUrlIssue" // Inc increments the counter for the given key. func (c *RepoStats) Inc(key string) { diff --git a/internal/common/testdata/identifier_bad/bad_json.jsonld b/internal/common/testdata/identifier_bad/bad_json.jsonld new file mode 100644 index 00000000..8bd2252e --- /dev/null +++ b/internal/common/testdata/identifier_bad/bad_json.jsonld @@ -0,0 +1,268 @@ +{ + "@context": { + "@vocab": "https://schema.org/", + "gdx": "https://geodex.org/voc/", + "datacite": "http://purl.org/spar/datacite/" + }, + "@type": ["Service", "Organization", "WebSite", "WebPage"], + "@id": "https://www.rvdata.us", + "additionalType": "https://geodex.org/voc/ResearchRepositoryService", + "legalName": "Rolling Deck to Repository", + "name": "R2R", + "url": "https://www.rvdata.us", + "category": ["Marine Geoscience", "Marine Geophysics", "Oceanography", "Meteorology"], + "description": "The Rolling Deck to Repository (R2R) program manages underway environmental sensor data from scientific expeditions for the U.S. academic research fleet. R2R maintains a comprehensive catalog of modern academic fleet expeditions with inventories of end-of-cruise data distributions from each vessel operator, and provides access to the routinely collected environmental data obtained from ship-board sensors for each expedition. These data are submitted to the NOAA Data Centers for long-term preservation and data quality assessments are provided for the most common data streams. R2R creates a number of value-added products including quality-controlled shiptrack navigation, geophysical profiles (gravity, magnetics, bathymetry), CTD data and real-time meteorology/near-surface oceanography in partnership with the SAMOS program. Data from each cruise are submitted directly to R2R by vessel operators, rather than from science parties. R2R works in close collaboration with the U.S. University-National Oceanographic Laboratory System, vessel technicians, NOAA Data Centers, and other U.S. 
and international ocean data systems.", + "sameAs": [ + "https://www.re3data.org/repository/r3d100010735" + ], + "logo": { + "@type": "ImageObject", + "url": "https://www.rvdata.us/images/Logo.4b1519be.png" + }, + "foundingDate": "2009-09-01", + "address": { + "@type": "PostalAddress", + "streetAddress": "61 Route 9W", + "addressLocality": "Palisades", + "addressRegion": "NY", + "postalCode": "10964-1000", + "addressCountry": "USA" + }, + "contactPoint": { + "@id": "https://www.rvdata.us/about", + "@type": "ContactPoint", + "name": "R2R Manager", + "email": "info@rvdata.us", + "url": "https://www.rvdata.us/contact", + "contactType": "customer support" + }, + "provider": { + "@id": "https://www.rvdata.us" + }, + "funder": { + "@type": "Organization", + "@id": "https://dx.doi.org/10.13039/100000141", + "legalName": "Division of Ocean Sciences", + "alternateName": "OCE", + "url": "https://www.nsf.gov/div/index.jsp?div=OCE", + "identifier": { + "@type": "PropertyValue", + "propertyID": "http://purl.org/spar/datacite/:doi", + "value": "10.13039/100000141", + "url": "https://doi.org/10.13039/100000141" + }, + "parentOrganization": { + "@type": "Organization", + "@id": "https://dx.doi.org/10.13039/100000085", + "legalName": "Directorate for Geosciences", + "alternateName": "NSF-GEO", + "url": "https://www.nsf.gov", + "identifier": { + "@type": "PropertyValue", + "propertyID": "http://purl.org/spar/datacite/:doi", + "value": "10.13039/100000085", + "url": "https://doi.org/10.13039/100000085" + }, + "parentOrganization": { + "@type": "Organization", + "@id": "https://dx.doi.org/10.13039/100000001", + "legalName": "National Science Foundation", + "alternateName": "NSF", + "url": "https://www.nsf.gov", + "identifier": { + "@type": "PropertyValue", + "propertyID": "http://purl.org/spar/datacite/:doi", + "value": "10.13039/100000001", + "url": "https://doi.org/10.13039/100000001" + } + } + } + }, + "parentOrganization": [ + { + "@type": "Organization", + "@id": "https://viaf.org/viaf/142992181/", + "name": "Lamont-Doherty Earth Observatory", + "url": "https://www.ldeo.columbia.edu", + "address": { + "@type": "PostalAddress", + "streetAddress": "61 Route 9W", + "addressLocality": "Palisades", + "addressRegion": "NY", + "postalCode": "10964-1000", + "addressCountry": "USA" + } + }, + { + "@type": "Organization", + "@id": "https://viaf.org/viaf/156836332/", + "legalName": "Columbia University", + "url": "https://www.columbia.edu/" + } + ], + "publishingPrinciples": [ + { + "@type": "DigitalDocument", + "additionalType": "https://geodex.org/voc/Protocol-Access", + "name": "Data Access", + "url": "https://www.rvdata.us/about/policy/access", + "fileFormat": "text/html" + }, + { + "@type": "DigitalDocument", + "additionalType": "https://geodex.org/voc/Protocol-ResourceSubmissionPolicy", + "name": "NSF OCE Sample and Data Policy", + "url": "https://www.nsf.gov/publications/pub_summ.jsp?ods_key=nsf17037&org=NSF", + "fileFormat": "text/html" + }, + { + "@type": "DigitalDocument", + "additionalType": "https://geodex.org/voc/Protocol-ResourceSubmissionPolicy", + "name": "Data Submission", + "url": "https://www.rvdata.us/about/data-policies-and-repositories/data-submission", + "fileFormat": "text/html" + } + ], + "hasOfferCatalog": [ + { + "@type": "OfferCatalog", + "name": "R2R", + "itemListElement": [ + { + "@type": "DataCatalog", + "@id": "https://www.rvdata", + "name": "R2R", + "publishingPrinciples": [ + { + "@type": "DigitalDocument", + "additionalType": "https://geodex.org/voc/Protocol-Access", + "name": 
"Data Access", + "url": "https://www.rvdata.us/about/data-policies-and-repositories/data-access", + "fileFormat": "text/html" + }, + { + "@type": "DigitalDocument", + "additionalType": "https://geodex.org/voc/Protocol-ResourceSubmissionPolicy", + "name": "NSF OCE Sample and Data Policy", + "url": "https://www.nsf.gov/publications/pub_summ.jsp?ods_key=nsf17037&org=NSF", +/ "fileFormat": "text/html" + }, + { + "@type": "DigitalDocument", + "additionalType": "https://geodex.org/voc/Protocol-ResourceSubmissionPolicy", + "name": "Data Submission", + "url": "https://www.rvdata.us/about/data-policies-and-repositories/data-submission", + "fileFormat": "text/html" + } + ] + } + ] + }, + { + "@type": "OfferCatalog", + "additionalType": "https://geodex.org/voc/ResearchResourceTypes", + "itemListElement": [ + {"@type": "Thing", "@id": "schema:Dataset", "name": "Dataset"} + ], + "name": "R2R Cruise Database" + }, + { + "@type": "OfferCatalog", + "additionalType": "https://geodex.org/voc/ResearchResourceTypes", + "itemListElement": [ + { + "@type": "thing", + "@id": "https://geodex.org/voc/ResearchResourceTypes", + "additionalType": "https://geodex.org/voc/ResearchResourceTypes", + "name": "R2R Resources and Best Practices", + "description": "R2R supplied information for scientist, vessel operators and users. These documents help plan for, participate in and archive the data/metadat for a cruise." + } + ] + } + ], + "availableChannel": [ + { + "@type": "ServiceChannel", + "serviceUrl": "https://data.rvdata.us/sparql", + "providesService": { + "@type": "Service", + "additionalType": "https://geodex.org/voc/SearchService", + "name": "R2R Linked Data (SPARQL Endpoint) Cruise Catalog", + "description": "Explore Entire R2R Linked Data (SPARQL Endpoing) cruise catalog", + "potentialAction": { "@id": "https://www.rvdata.us/search" } + } + }, + { + "@type": "ServiceChannel", + "serviceUrl": "https://api.rvdata.us/catalog?service=CSW&version=3.0.0&request=GetCapabilities", + "providesService": { + "@type": "Service", + "additionalType": "https://geodex.org/voc/SearchService", + "name": "R2R OGC CSW Cruise Descriptions", + "description": "Get overview of capablities of cruise description service.", + "potentialAction": { "@id": "https://api.rvdata.us/catalog?service=CSW&version=3.0.0&request=GetCapabilities" } + } + }, + { + "@type": "ServiceChannel", + "serviceUrl": "https://api.rvdata.us/rss/cruise/program/OOI", + "providesService": { + "@type": "Service", + "@id": "https://api.rvdata.us/rss/cruise/program/OOI", + "additionalType": "https://geodex.org/voc/SyndicationService", + "name": "R2R Cruise DOI syndication", + "description": "Get OOI list of R2R cruises" + } + }, + { + "@type": "ServiceChannel", + "serviceUrl": "https://api.rvdata.us/rss/cruise/program/OOI", + "providesService": { + "@type": "Service", + "additionalType": "https://geodex.org/voc/SearchService", + "name": "R2R OGC WFS Cruise Tracklines", + "description": "R2R GeoRSS/Atom Service of Cruises" + } + } + ], + "potentialAction": [ + { + "@id": "https://www.rvdata.us/search/cruise", + "@type": "SearchAction", + "name": "R2R Cruise search", + "description": "Link to web page enabling users to construct and submit queries to the R2R catalog.", + "target": { + "@type": "EntryPoint", + "url": "https://www.rvdata.us/search", + "httpMethod": "GET", + "urlTemplate": "https://www.rvdata.us/search/cruise/{cruise_id}" + }, + "query-input": "required name=cruise_id" + }, + { + "@id": "https://www.rvdata.us/search/fileset", + "@type": "SearchAction", + 
"name": "R2R Fileset search", + "description": "Link to web page enabling users to construct and submit queries to the R2R catalog.", + "target": { + "@type": "EntryPoint", + "url": "https://www.rvdata.us/search", + "httpMethod": "GET", + "urlTemplate": "https://www.rvdata.us/search/fileset/{fileset_id}" + }, + "query-input": "required name=fileset_id" + }, + { + "@id": "https://www.rvdata.us/search", + "@type": "SearchAction", + "name": "R2R Cruise Catalog", + "description": "Link to web page enabling users to locate all cruises for a vessel in the R2R catalog.", + "target": { + "@type": "EntryPoint", + "url": "https://www.rvdata.us/search", + "urlTemplate": "https://www.rvdata.us/search/vessel/{vessel_id}" + }, + "query-input": "required name=vessel_id" + } + ] + } \ No newline at end of file diff --git a/internal/config/sources.go b/internal/config/sources.go index 9ee511a2..d5909920 100644 --- a/internal/config/sources.go +++ b/internal/config/sources.go @@ -19,6 +19,7 @@ const ( NormalizedJsonSha = "normalizedjsonsha" IdentifierString = "identifierstring" SourceUrl = "sourceurl" + FileSha = "filesha" ) type ContextOption int64 diff --git a/internal/summoner/acquire/acquire.go b/internal/summoner/acquire/acquire.go index f5d019d8..daa9d249 100644 --- a/internal/summoner/acquire/acquire.go +++ b/internal/summoner/acquire/acquire.go @@ -1,7 +1,14 @@ package acquire import ( + "context" + "github.com/chromedp/chromedp" "github.com/gleanerio/gleaner/internal/common" + "github.com/mafredri/cdp" + "github.com/mafredri/cdp/devtool" + "github.com/mafredri/cdp/protocol/target" + "github.com/mafredri/cdp/rpcc" + "github.com/mafredri/cdp/session" "net/http" "net/url" "strings" @@ -51,17 +58,17 @@ func ResRetrieve(v1 *viper.Viper, mc *minio.Client, m map[string][]string, runSt wg.Wait() } -func getConfig(v1 *viper.Viper, sourceName string) (string, int, int64, int, string, string, error) { +func getConfig(v1 *viper.Viper, sourceName string) (string, int, int64, int, string, string, error, bool) { bucketName, err := configTypes.GetBucketName(v1) if err != nil { - return bucketName, 0, 0, 0, configTypes.AccceptContentType, "", err + return bucketName, 0, 0, 0, configTypes.AccceptContentType, "", err, false } var mcfg configTypes.Summoner mcfg, err = configTypes.ReadSummmonerConfig(v1.Sub("summoner")) if err != nil { - return bucketName, 0, 0, 0, configTypes.AccceptContentType, "", err + return bucketName, 0, 0, 0, configTypes.AccceptContentType, "", err, false } // Set default thread counts and global delay tc := mcfg.Threads @@ -77,8 +84,9 @@ func getConfig(v1 *viper.Viper, sourceName string) (string, int, int64, int, str acceptContent := source.AcceptContentType jsonProfile := source.JsonProfile hw := source.HeadlessWait + headless := source.Headless if err != nil { - return bucketName, tc, delay, hw, acceptContent, jsonProfile, err + return bucketName, tc, delay, hw, acceptContent, jsonProfile, err, false } if source.Delay != 0 && source.Delay > delay { @@ -88,13 +96,17 @@ func getConfig(v1 *viper.Viper, sourceName string) (string, int, int64, int, str } log.Info("Thread count ", tc, " delay ", delay) - return bucketName, tc, delay, hw, acceptContent, jsonProfile, nil + return bucketName, tc, delay, hw, acceptContent, jsonProfile, nil, headless } func getDomain(v1 *viper.Viper, mc *minio.Client, urls []string, sourceName string, wg *sync.WaitGroup, repologger *log.Logger, repoStats *common.RepoStats) { - bucketName, tc, delay, headlessWait, acceptContent, jsonProfile, err := getConfig(v1, 
sourceName) + var timeout = 60 * time.Second + var retries = 3 + var totalTimeout = timeout * time.Duration(retries+1) + + bucketName, tc, delay, headlessWait, acceptContent, jsonProfile, err, headless := getConfig(v1, sourceName) if err != nil { // trying to read a source, so let's not kill everything with a panic/fatal log.Error("Error reading config file ", err) @@ -103,6 +115,58 @@ func getDomain(v1 *viper.Viper, mc *minio.Client, urls []string, sourceName stri var client http.Client + // stuff to setup headless sessions + if headlessWait < 0 { + log.Info("Headless wait on a headless configured to less that zero. Setting to 0") + headlessWait = 0 // if someone screws up the config, be good + } + + if totalTimeout < time.Duration(headlessWait)*time.Second { + timeout = time.Duration(headlessWait) * time.Second + } + /// if you cancel here, then everything after first times out + //ctx, cancel := context.WithTimeout(context.Background(), timeout*time.Duration(retries)) + //ctx, cancel := context.WithTimeout(context.TODO(), timeout*time.Duration(retries)) + //defer cancel() + ctx, cancel := chromedp.NewContext(context.Background()) + defer cancel() + + // read config file + mcfg, err := configTypes.ReadSummmonerConfig(v1.Sub("summoner")) + + // Use the DevTools HTTP/JSON API to manage targets (e.g. pages, webworkers). + //devt := devtool.New(mcfg["headless"]) + devt := devtool.New(mcfg.Headless) + + pt, err := devt.Get(ctx, devtool.Page) + if err != nil { + pt, err = devt.Create(ctx) + if err != nil { + log.WithFields(log.Fields{"issue": "Not REPO FAULT. Devtools... Is Headless Container running?"}).Error(err) + repologger.WithFields(log.Fields{}).Error("Not REPO FAULT. Devtools... Is Headless Container running?") + repoStats.Inc(common.HeadlessError) + return + } + } + + // Initiate a new RPC connection to the Chrome DevTools Protocol target. + conn, err := rpcc.DialContext(ctx, pt.WebSocketDebuggerURL) + if err != nil { + log.WithFields(log.Fields{"issue": "Not REPO FAULT. Devtools... Is Headless Container running?"}).Error(err) + repologger.WithFields(log.Fields{}).Error("Not REPO FAULT. Devtools... Is Headless Container running?") + repoStats.Inc(common.HeadlessError) + return + } + defer conn.Close() + sessionclient := cdp.NewClient(conn) + m, err := session.NewManager(sessionclient) + if err != nil { + // Handle error. + } + defer m.Close() + + // session + semaphoreChan := make(chan struct{}, tc) // a blocking channel to keep concurrency under control lwg := sync.WaitGroup{} @@ -131,82 +195,225 @@ func getDomain(v1 *viper.Viper, mc *minio.Client, urls []string, sourceName stri repologger.Trace("Indexing", urlloc) log.Debug("Indexing ", urlloc) - req, err := http.NewRequest("GET", urlloc, nil) - if err != nil { - log.Error(i, err, urlloc) - } - req.Header.Set("User-Agent", EarthCubeAgent) - req.Header.Set("Accept", acceptContent) - - resp, err := client.Do(req) - if err != nil { - log.Error("#", i, " error on ", urlloc, err) // print an message containing the index (won't keep order) - repologger.WithFields(log.Fields{"url": urlloc}).Error(err) - lwg.Done() // tell the wait group that we be done - <-semaphoreChan - return - } - defer resp.Body.Close() - - jsonlds, err := FindJSONInResponse(v1, urlloc, jsonProfile, repologger, resp) - // there was an issue with sitemaps... 
but now this code - //if contains(contentTypeHeader, JSONContentType) || contains(contentTypeHeader, "application/json") { - // - // b, err := io.ReadAll(resp.Body) - // // b, err := ioutil.ReadAll(resp.Body) Go.1.15 and earlier - // if err != nil { - // log.Error("#", i, " error on ", urlloc, err) // print an message containing the index (won't keep order) - // repoStats.Inc(common.Issues) - // lwg.Done() // tell the wait group that we be done - // <-semaphoreChan - // return - // } - // jsonlds = []string{string(b)} - //} else { - // var err error - // jsonlds, err = FindJSONInResponse(v1, urlloc, jsonProfile, repologger, resp) - // if err != nil { - // log.Error("#", i, " error on ", urlloc, err) // print an message containing the index (won't keep order) - // repoStats.Inc(common.Issues) - // lwg.Done() // tell the wait group that we be done - // <-semaphoreChan - // return - // } - //} - if err != nil { - log.Error("#", i, " error on ", urlloc, err) // print an message containing the index (won't keep order) - repoStats.Inc(common.Issues) - lwg.Done() // tell the wait group that we be done - <-semaphoreChan - return - } - - // For incremental indexing I want to know every URL I visit regardless - // if there is a valid JSON-LD document or not. For "full" indexing we - // visit ALL URLs. However, many will not have JSON-LD, so let's also record - // and avoid those during incremental calls. - - // even is no JSON-LD packages found, record the event of checking this URL - if len(jsonlds) < 1 { - // TODO is her where I then try headless, and scope the following for into an else? - if headlessWait >= 0 { - log.WithFields(log.Fields{"url": urlloc, "contentType": "Direct access failed, trying headless']"}).Info("Direct access failed, trying headless for ", urlloc) - repologger.WithFields(log.Fields{"url": urlloc, "contentType": "Direct access failed, trying headless']"}).Error() // this needs to go into the issues file - err := PageRenderAndUpload(v1, mc, 60*time.Second, urlloc, sourceName, repologger, repoStats) // TODO make delay configurable + if headless { + log.WithFields(log.Fields{"url": urlloc, "issue": "running headless"}).Trace("Headless ", urlloc) + args := target.NewCreateTargetArgs("") + //args.SetNewWindow(true) + newPage, err := sessionclient.Target.CreateTarget(ctx, + args) + if err != nil { + log.WithFields(log.Fields{"url": urlloc, "issue": "Not REPO FAULT. NewCreateTargetArgs... Is Headless Container running?"}).Error(err) + repologger.WithFields(log.Fields{"url": urlloc}).Error("Not REPO FAULT. NewCreateTargetArgs... Is Headless Container running?") + repoStats.Inc(common.HeadlessError) + lwg.Done() + <-semaphoreChan + return + } + closeArgs := target.NewCloseTargetArgs(newPage.TargetID) + defer func(Target cdp.Target, ctx context.Context, args *target.CloseTargetArgs) { + log.Infof("Close Target Defer targetID: %s url: %s ", newPage.TargetID, urlloc) + _, err := Target.CloseTarget(ctx, args) if err != nil { - log.WithFields(log.Fields{"url": urlloc, "issue": "converting json ld"}).Error("PageRenderAndUpload ", urlloc, "::", err) - repologger.WithFields(log.Fields{"url": urlloc, "issue": "converting json ld"}).Error(err) + log.WithFields(log.Fields{"url": urlloc, "issue": "error closing target"}).Error("PageRenderAndUpload ", urlloc, "::", err) + } + }(sessionclient.Target, ctx, closeArgs) + // newPageConn uses the underlying conn without establishing a new + // websocket connection. 
+ //newPageConn, err := m.Dial(ctx, newPage.TargetID) + //if err != nil { + // log.WithFields(log.Fields{"url": urlloc, "issue": "Not REPO FAULT. newPageConn... Is Headless Container running?"}).Error(err) + // repologger.WithFields(log.Fields{"url": urlloc}).Error("Not REPO FAULT. newPageConn... Is Headless Container running?") + // repoStats.Inc(common.HeadlessError) + // lwg.Done() + // <-semaphoreChan + // return + //} + ////defer func(newPageConn *rpcc.Conn) { + //// log.Info("NewPageConn defer") + //// err := newPageConn.Close() + //// if err != nil { + //// log.WithFields(log.Fields{"url": urlloc, "issue": "error clocing connection"}).Error("PageRenderAndUpload ", urlloc, "::", err) + //// + //// } + ////}(newPageConn) + // + //c := cdp.NewClient(newPageConn) + err = PageRenderAndUpload(v1, mc, timeout, urlloc, sourceName, repologger, repoStats, m, newPage.TargetID) // TODO make delay configurable + + if err != nil { + log.WithFields(log.Fields{"url": urlloc, "issue": "converting json ld"}).Error("PageRenderAndUpload ", urlloc, " ::", err) + repologger.WithFields(log.Fields{"url": urlloc, "issue": "converting json ld"}).Error(err) + //err = newPageConn.Close() + //if err != nil { + // log.WithFields(log.Fields{"url": urlloc, "issue": "error closing connection"}).Error("PageRenderAndUpload ", urlloc, " ::", err) + // + //} + //closeTargetResp, err := sessionclient.Target.CloseTarget(ctx, closeArgs) + //log.Info(closeTargetResp) + //if err != nil { + // log.WithFields(log.Fields{"url": urlloc, "issue": "error closing target"}).Error("PageRenderAndUpload ", urlloc, " ::", err) + // + //} + lwg.Done() + <-semaphoreChan + return } - + //err = newPageConn.Close() + //if err != nil { + // log.WithFields(log.Fields{"url": urlloc, "issue": "error closing connection"}).Error("PageRenderAndUpload ", urlloc, " ::", err) + // + //} + //closeTargetResp, err := sessionclient.Target.CloseTarget(ctx, closeArgs) + //log.Info(closeTargetResp) + //if err != nil { + // log.WithFields(log.Fields{"url": urlloc, "issue": "error closing target"}).Error("PageRenderAndUpload ", urlloc, " ::", err) + // + //} } else { - log.WithFields(log.Fields{"url": urlloc, "issue": "Direct access worked"}).Trace("Direct access worked for ", urlloc) - repologger.WithFields(log.Fields{"url": urlloc, "issue": "Direct access worked"}).Trace() - repoStats.Inc(common.Summoned) - } + req, err := http.NewRequest("GET", urlloc, nil) + if err != nil { + log.Error(i, err, urlloc) + } + req.Header.Set("User-Agent", EarthCubeAgent) + req.Header.Set("Accept", acceptContent) + + resp, err := client.Do(req) + if err != nil { + log.Error("#", i, " error on ", urlloc, err) // print an message containing the index (won't keep order) + repologger.WithFields(log.Fields{"url": urlloc}).Error(err) + lwg.Done() // tell the wait group that we be done + <-semaphoreChan + return + } + defer resp.Body.Close() + + // if there is an error, then don't try again. 
+				if (resp.StatusCode >= 400) && (resp.StatusCode < 600) {
+					switch resp.StatusCode {
+					case 403:
+						log.Error("#", i, " not authorized ", urlloc, err) // print a message containing the index (won't keep order)
+						repologger.WithFields(log.Fields{"url": urlloc}).Error(err)
+						repoStats.Inc(common.NotAuthorized)
+					case 404:
+						log.Error("#", i, " bad url ", urlloc, err) // print a message containing the index (won't keep order)
+						repologger.WithFields(log.Fields{"url": urlloc}).Error(err)
+						repoStats.Inc(common.BadUrl)
+					case 500:
+						log.Error("#", i, " server error ", urlloc, err) // print a message containing the index (won't keep order)
+						repologger.WithFields(log.Fields{"url": urlloc}).Error(err)
+						repoStats.Inc(common.RepoServerError)
+					default:
+						log.Error("#", i, " generic error ", urlloc, err) // print a message containing the index (won't keep order)
+						repologger.WithFields(log.Fields{"url": urlloc}).Error(err)
+						repoStats.Inc(common.GenericIssue)
+					}
+					lwg.Done() // tell the wait group that we're done
+					<-semaphoreChan
+					return
+				}
+
+				jsonlds, err := FindJSONInResponse(v1, urlloc, jsonProfile, repologger, resp)
+				// there was an issue with sitemaps... but now this code
+				//if contains(contentTypeHeader, JSONContentType) || contains(contentTypeHeader, "application/json") {
+				//
+				//	b, err := io.ReadAll(resp.Body)
+				//	// b, err := ioutil.ReadAll(resp.Body) Go.1.15 and earlier
+				//	if err != nil {
+				//		log.Error("#", i, " error on ", urlloc, err) // print a message containing the index (won't keep order)
+				//		repoStats.Inc(common.Issues)
+				//		lwg.Done() // tell the wait group that we're done
+				//		<-semaphoreChan
+				//		return
+				//	}
+				//	jsonlds = []string{string(b)}
+				//} else {
+				//	var err error
+				//	jsonlds, err = FindJSONInResponse(v1, urlloc, jsonProfile, repologger, resp)
+				//	if err != nil {
+				//		log.Error("#", i, " error on ", urlloc, err) // print a message containing the index (won't keep order)
+				//		repoStats.Inc(common.Issues)
+				//		lwg.Done() // tell the wait group that we're done
+				//		<-semaphoreChan
+				//		return
+				//	}
+				//}
+				if err != nil {
+					log.Error("#", i, " error on ", urlloc, err) // print a message containing the index (won't keep order)
+					repoStats.Inc(common.Issues)
+					lwg.Done() // tell the wait group that we're done
+					<-semaphoreChan
+					return
+				}
+
+				// For incremental indexing I want to know every URL I visit regardless
+				// if there is a valid JSON-LD document or not. For "full" indexing we
+				// visit ALL URLs. However, many will not have JSON-LD, so let's also record
+				// and avoid those during incremental calls.
+
+				// even if no JSON-LD packages are found, record the event of checking this URL
+				if len(jsonlds) < 1 {
+					// TODO is here where I then try headless, and scope the following for into an else?
+					if headlessWait >= 0 {
+						log.WithFields(log.Fields{"url": urlloc, "contentType": "Direct access failed, trying headless"}).Info("Direct access failed, trying headless for ", urlloc)
+						repologger.WithFields(log.Fields{"url": urlloc, "contentType": "Direct access failed, trying headless"}).Error() // this needs to go into the issues file
+						args := target.NewCreateTargetArgs("")
+						//args.SetNewWindow(true)
+						newPage, err := sessionclient.Target.CreateTarget(ctx,
+							args)
+						if err != nil {
+							log.WithFields(log.Fields{"url": urlloc, "issue": "Not REPO FAULT. NewCreateTargetArgs...
Is Headless Container running?"}).Error(err) + repologger.WithFields(log.Fields{"url": urlloc}).Error("Not REPO FAULT. NewCreateTargetArgs... Is Headless Container running?") + repoStats.Inc(common.HeadlessError) + lwg.Done() + <-semaphoreChan + return + } + closeArgs := target.NewCloseTargetArgs(newPage.TargetID) + defer func(Target cdp.Target, ctx context.Context, args *target.CloseTargetArgs) { + log.Info("Close Target Defer") + _, err := Target.CloseTarget(ctx, args) + if err != nil { + log.WithFields(log.Fields{"url": urlloc, "issue": "error closing target"}).Error("PageRenderAndUpload ", urlloc, "::", err) + + } + }(sessionclient.Target, ctx, closeArgs) + // newPageConn uses the underlying conn without establishing a new + // websocket connection. + //newPageConn, err := m.Dial(ctx, newPage.TargetID) + //if err != nil { + // log.WithFields(log.Fields{"url": urlloc, "issue": "Not REPO FAULT. newPageConn... Is Headless Container running?"}).Error(err) + // repologger.WithFields(log.Fields{"url": urlloc}).Error("Not REPO FAULT. newPageConn... Is Headless Container running?") + // repoStats.Inc(common.HeadlessError) + // lwg.Done() + // <-semaphoreChan + // return + //} + //defer func(newPageConn *rpcc.Conn) { + // log.Info("NewPageConn defer") + // err := newPageConn.Close() + // if err != nil { + // log.WithFields(log.Fields{"url": urlloc, "issue": "error clocing connection"}).Error("PageRenderAndUpload ", urlloc, "::", err) + // + // } + //}(newPageConn) + // + //c := cdp.NewClient(newPageConn) + err = PageRenderAndUpload(v1, mc, 60*time.Second, urlloc, sourceName, repologger, repoStats, m, newPage.TargetID) // TODO make delay configurable + if err != nil { + log.WithFields(log.Fields{"url": urlloc, "issue": "converting json ld"}).Error("PageRenderAndUpload ", urlloc, "::", err) + repologger.WithFields(log.Fields{"url": urlloc, "issue": "converting json ld"}).Error(err) + } + } + + } else { + log.WithFields(log.Fields{"url": urlloc, "issue": "Direct access worked"}).Trace("Direct access worked for ", urlloc) + repologger.WithFields(log.Fields{"url": urlloc, "issue": "Direct access worked"}).Trace() + repoStats.Inc(common.Summoned) + } + UploadWrapper(v1, mc, bucketName, sourceName, urlloc, repologger, repoStats, jsonlds) + } // else headless bar.Add(1) // bar.Incr() log.Trace("#", i, "thread for", urlloc) // print an message containing the index (won't keep order) time.Sleep(time.Duration(delay) * time.Millisecond) // sleep a bit if directed to by the provider diff --git a/internal/summoner/acquire/acquire_test.go b/internal/summoner/acquire/acquire_test.go index e6cbd01e..927999ff 100644 --- a/internal/summoner/acquire/acquire_test.go +++ b/internal/summoner/acquire/acquire_test.go @@ -28,7 +28,7 @@ func TestGetConfig(t *testing.T) { } viper := ConfigSetupHelper(conf) - bucketName, tc, delay, _, _, _, err := getConfig(viper, "testSource") + bucketName, tc, delay, _, _, _, err, _ := getConfig(viper, "testSource") assert.Equal(t, "test", bucketName) assert.Equal(t, 5, tc) assert.Equal(t, int64(0), delay) @@ -43,7 +43,7 @@ func TestGetConfig(t *testing.T) { } viper := ConfigSetupHelper(conf) - bucketName, tc, delay, _, _, _, err := getConfig(viper, "testSource") + bucketName, tc, delay, _, _, _, err, _ := getConfig(viper, "testSource") assert.Equal(t, "test", bucketName) assert.Equal(t, 1, tc) assert.Equal(t, int64(1000), delay) @@ -58,7 +58,7 @@ func TestGetConfig(t *testing.T) { } viper := ConfigSetupHelper(conf) - bucketName, tc, delay, _, _, _, err := getConfig(viper, "testSource") 
+ bucketName, tc, delay, _, _, _, err, _ := getConfig(viper, "testSource") assert.Equal(t, "test", bucketName) assert.Equal(t, 5, tc) assert.Equal(t, int64(0), delay) @@ -73,7 +73,7 @@ func TestGetConfig(t *testing.T) { } viper := ConfigSetupHelper(conf) - bucketName, tc, delay, _, _, _, err := getConfig(viper, "testSource") + bucketName, tc, delay, _, _, _, err, _ := getConfig(viper, "testSource") assert.Equal(t, "test", bucketName) assert.Equal(t, 1, tc) assert.Equal(t, int64(100), delay) @@ -88,7 +88,7 @@ func TestGetConfig(t *testing.T) { } viper := ConfigSetupHelper(conf) - bucketName, tc, delay, _, _, _, err := getConfig(viper, "testSource") + bucketName, tc, delay, _, _, _, err, _ := getConfig(viper, "testSource") assert.Equal(t, "test", bucketName) assert.Equal(t, 1, tc) assert.Equal(t, int64(50), delay) diff --git a/internal/summoner/acquire/api.go b/internal/summoner/acquire/api.go index c9655ec0..04fdd98b 100644 --- a/internal/summoner/acquire/api.go +++ b/internal/summoner/acquire/api.go @@ -56,8 +56,8 @@ func RetrieveAPIData(apiSources []configTypes.Sources, mc *minio.Client, runStat } func getAPISource(v1 *viper.Viper, mc *minio.Client, source configTypes.Sources, wg *sync.WaitGroup, repologger *log.Logger, repoStats *common.RepoStats) { - - bucketName, tc, delay, _, acceptContent, jsonProfile, err := getConfig(v1, source.Name) // _ is headless wait + // _ is headless + bucketName, tc, delay, _, acceptContent, jsonProfile, err, _ := getConfig(v1, source.Name) // _ is headless wait if err != nil { // trying to read a source, so let's not kill everything with a panic/fatal log.Error("Error reading config file ", err) diff --git a/internal/summoner/acquire/headlessNG.go b/internal/summoner/acquire/headlessNG.go index c1bb0272..c9d6c67b 100644 --- a/internal/summoner/acquire/headlessNG.go +++ b/internal/summoner/acquire/headlessNG.go @@ -4,17 +4,21 @@ import ( "context" "encoding/json" "fmt" + target2 "github.com/chromedp/cdproto/target" + "github.com/chromedp/chromedp" "github.com/gleanerio/gleaner/internal/common" + "github.com/mafredri/cdp/devtool" + "github.com/mafredri/cdp/protocol/target" + "github.com/mafredri/cdp/rpcc" + "github.com/mafredri/cdp/session" log "github.com/sirupsen/logrus" "time" configTypes "github.com/gleanerio/gleaner/internal/config" "github.com/mafredri/cdp" - "github.com/mafredri/cdp/devtool" "github.com/mafredri/cdp/protocol/page" "github.com/mafredri/cdp/protocol/runtime" - "github.com/mafredri/cdp/rpcc" minio "github.com/minio/minio-go/v7" "github.com/spf13/viper" "github.com/valyala/fasttemplate" @@ -32,6 +36,9 @@ func HeadlessNG(v1 *viper.Viper, mc *minio.Client, m map[string][]string, runSta // buf bytes.Buffer // logger = log.New(&buf, "logger: ", log.Lshortfile) //) + var timeout = 60 * time.Second + var retries = 3 + var totalTimeout = timeout * time.Duration(retries+1) for k := range m { r := runStats.Add(k) @@ -44,10 +51,83 @@ func HeadlessNG(v1 *viper.Viper, mc *minio.Client, m map[string][]string, runSta } else { repologger.Info("Headless chrome call to ", k) } + _, _, _, headlessWait, _, _, err, _ := getConfig(v1, k) + if err != nil { + // trying to read a source, so let's not kill everything with a panic/fatal + log.Error("Error reading config file ", err) + repologger.Error("Error reading config file ", err) + } + // stuff to setup headless sessions + if headlessWait < 0 { + log.Info("Headless wait on a headless configured to less that zero. 
Setting to 0") + headlessWait = 0 // if someone screws up the config, be good + } + if totalTimeout < time.Duration(headlessWait)*time.Second { + timeout = time.Duration(headlessWait) * time.Second + } + //ctx, cancel := context.WithTimeout(context.Background(), timeout*time.Duration(retries)) + //ctx, cancel := context.WithTimeout(context.TODO(), timeout*time.Duration(retries)) + //defer cancel() + ctx, cancel := chromedp.NewContext(context.TODO()) + defer cancel() + + // read config file + mcfg, err := configTypes.ReadSummmonerConfig(v1.Sub("summoner")) + + // Use the DevTools HTTP/JSON API to manage targets (e.g. pages, webworkers). + //devt := devtool.New(mcfg["headless"]) + devt := devtool.New(mcfg.Headless) + + pt, err := devt.Get(ctx, devtool.Page) + if err != nil { + pt, err = devt.Create(ctx) + if err != nil { + log.WithFields(log.Fields{"issue": "Not REPO FAULT. Devtools... Is Headless Container running?"}).Error(err) + repologger.WithFields(log.Fields{}).Error("Not REPO FAULT. Devtools... Is Headless Container running?") + + return + } + } + + // Initiate a new RPC connection to the Chrome DevTools Protocol target. + conn, err := rpcc.DialContext(ctx, pt.WebSocketDebuggerURL) + if err != nil { + log.WithFields(log.Fields{"issue": "Not REPO FAULT. Devtools... Is Headless Container running?"}).Error(err) + repologger.WithFields(log.Fields{}).Error("Not REPO FAULT. Devtools... Is Headless Container running?") + + return + } + defer conn.Close() + sessionclient := cdp.NewClient(conn) + manager, err := session.NewManager(sessionclient) + if err != nil { + // Handle error. + } + defer manager.Close() + + // session for i := range m[k] { + args := target.NewCreateTargetArgs("") + //args.SetNewWindow(true) + newPage, err := sessionclient.Target.CreateTarget(ctx, + args) + if err != nil { + log.WithFields(log.Fields{"url": m[k][i], "issue": "Not REPO FAULT. NewCreateTargetArgs... Is Headless Container running?"}).Error(err) + repologger.WithFields(log.Fields{"url": m[k][i]}).Error("Not REPO FAULT. NewCreateTargetArgs... 
Is Headless Container running?") + + return + } + closeArgs := target.NewCloseTargetArgs(newPage.TargetID) + defer func(Target cdp.Target, ctx context.Context, args *target.CloseTargetArgs) { + log.Info("Close Target Defer") + _, err := Target.CloseTarget(ctx, args) + if err != nil { + log.WithFields(log.Fields{"url": m[k][i], "issue": "error closing target"}).Error("PageRenderAndUpload ", m[k][i], " ::", err) - err := PageRenderAndUpload(v1, mc, 60*time.Second, m[k][i], k, repologger, r) // TODO make delay configurable + } + }(sessionclient.Target, ctx, closeArgs) + err = PageRenderAndUpload(v1, mc, timeout, m[k][i], k, repologger, r, manager, newPage.TargetID) // TODO make delay configurable if err != nil { log.Error(m[k][i], "::", err) } @@ -141,7 +221,7 @@ func HeadlessNG(v1 *viper.Viper, mc *minio.Client, m map[string][]string, runSta // //} -func PageRenderAndUpload(v1 *viper.Viper, mc *minio.Client, timeout time.Duration, url, k string, repologger *log.Logger, repoStats *common.RepoStats) error { +func PageRenderAndUpload(v1 *viper.Viper, mc *minio.Client, timeout time.Duration, url, k string, repologger *log.Logger, repoStats *common.RepoStats, m *session.Manager, targetID target.ID) error { repologger.WithFields(log.Fields{"url": url}).Trace("PageRenderAndUpload") // page render handles this //ctx, cancel := context.WithTimeout(context.Background(), timeout) @@ -155,22 +235,22 @@ func PageRenderAndUpload(v1 *viper.Viper, mc *minio.Client, timeout time.Duratio //mcfg := v1.GetStringMapString("summoner") //mcfg, err := configTypes.ReadSummmonerConfig(v1.Sub("summoner")) - jsonlds, err := PageRender(v1, timeout, url, k, repologger, repoStats) + jsonlds, err := PageRender(v1, timeout, url, k, repologger, repoStats, m, targetID) if err == nil { // from page render. If there are no errros, upload. 
if len(jsonlds) > 1 { log.WithFields(log.Fields{"url": url, "issue": "Multiple JSON"}).Info("Error uploading jsonld to object store:", url) - repologger.WithFields(log.Fields{"url": url, "issue": "Multiple JSON"}).Debug() + repologger.WithFields(log.Fields{"url": url, "issue": "Multiple JSON"}).Info() } - for _, jsonld := range jsonlds { - sha, err := Upload(v1, mc, bucketName, k, url, jsonld) - if err != nil { - log.WithFields(log.Fields{"url": url, "sha": sha, "issue": "Error uploading jsonld to object store"}).Error("Error uploading jsonld to object store:", url, err, sha) - repologger.WithFields(log.Fields{"url": url, "sha": sha, "issue": "Error uploading jsonld to object store"}).Error(err) + for i, jsonld := range jsonlds { + sha, err2 := Upload(v1, mc, bucketName, k, url, jsonld) + if err2 != nil { + log.WithFields(log.Fields{"url": url, "sha": sha, "jsonld#": i, "issue": "Error uploading jsonld to object store"}).Error("Error uploading jsonld to object store:", url, err2, sha) + repologger.WithFields(log.Fields{"url": url, "sha": sha, "jsonld#": i, "issue": "Error uploading jsonld to object store"}).Error(err2) repoStats.Inc(common.StoreError) } else { - log.WithFields(log.Fields{"url": url, "sha": sha, "issue": "Uploaded JSONLD to object store"}).Info("Uploaded JSONLD to object store:", url, err, sha) - repologger.WithFields(log.Fields{"url": url, "sha": sha, "issue": "Uploaded JSONLD to object store"}).Debug() + log.WithFields(log.Fields{"url": url, "sha": sha, "jsonld#": i, "issue": "Uploaded JSONLD to object store"}).Info("Uploaded JSONLD to object store:", url, sha) + repologger.WithFields(log.Fields{"url": url, "sha": sha, "jsonld#": i, "issue": "Uploaded JSONLD to object store"}).Info() repoStats.Inc(common.Stored) } } @@ -178,12 +258,15 @@ func PageRenderAndUpload(v1 *viper.Viper, mc *minio.Client, timeout time.Duratio return err } -func PageRender(v1 *viper.Viper, timeout time.Duration, url, k string, repologger *log.Logger, repoStats *common.RepoStats) ([]string, error) { +func PageRender(v1 *viper.Viper, timeout time.Duration, url, k string, repologger *log.Logger, repoStats *common.RepoStats, m *session.Manager, targetID target.ID) ([]string, error) { + repologger.WithFields(log.Fields{"url": url}).Trace("PageRender") retries := 3 + var totalTimeout = timeout * time.Duration(retries+1) //add some time for cleanup. sources, err := configTypes.GetSources(v1) source, err := configTypes.GetSourceByName(sources, k) headlessWait := source.HeadlessWait + response := []string{} if headlessWait < 0 { log.Info("Headless wait on a headless configured to less that zero. Setting to 0") headlessWait = 0 // if someone screws up the config, be good @@ -193,39 +276,102 @@ func PageRender(v1 *viper.Viper, timeout time.Duration, url, k string, repologge timeout = time.Duration(headlessWait) * time.Second } - ctx, cancel := context.WithTimeout(context.Background(), timeout*time.Duration(retries)) + // context for rpcc.Conn with timeout. + //ctx, cancel := context.WithTimeout(context.Background(), timeout*time.Duration(retries)) + //ctxrRcc, cancel := context.WithTimeout(context.TODO(), timeout*time.Duration(retries)) + //defer cancel() + ctxPrcc, cancel := chromedp.NewContext(context.Background()) defer cancel() - response := []string{} - // read config file - mcfg, err := configTypes.ReadSummmonerConfig(v1.Sub("summoner")) - - // Use the DevTools HTTP/JSON API to manage targets (e.g. pages, webworkers). 
- //devt := devtool.New(mcfg["headless"]) - devt := devtool.New(mcfg.Headless) - - pt, err := devt.Get(ctx, devtool.Page) - if err != nil { - pt, err = devt.Create(ctx) - if err != nil { - log.WithFields(log.Fields{"url": url, "issue": "Not REPO FAULT. Devtools... Is Headless Container running?"}).Error(err) - repologger.WithFields(log.Fields{"url": url}).Error("Not REPO FAULT. Devtools... Is Headless Container running?") - repoStats.Inc(common.HeadlessError) - return response, err - } - } - // Initiate a new RPC connection to the Chrome DevTools Protocol target. - conn, err := rpcc.DialContext(ctx, pt.WebSocketDebuggerURL) + newPageConn, err := m.Dial(ctxPrcc, targetID) + log.Infof("headless context targetID: %s url: %s", targetID, url) if err != nil { - log.WithFields(log.Fields{"url": url, "issue": "Not REPO FAULT. Devtools... Is Headless Container running?"}).Error(err) - repologger.WithFields(log.Fields{"url": url}).Error("Not REPO FAULT. Devtools... Is Headless Container running?") + log.WithFields(log.Fields{"url": url, "issue": "Not REPO FAULT. newPageConn... Is Headless Container running?"}).Error(err) + repologger.WithFields(log.Fields{"url": url}).Error("Not REPO FAULT. newPageConn... Is Headless Container running?") repoStats.Inc(common.HeadlessError) - return response, err + + return nil, err } - defer conn.Close() // Leaving connections open will leak memory. + //defer func(newPageConn *rpcc.Conn) { + // log.Info("NewPageConn defer") + // err := newPageConn.Close() + // if err != nil { + // log.WithFields(log.Fields{"url": urlloc, "issue": "error clocing connection"}).Error("PageRenderAndUpload ", urlloc, "::", err) + // + // } + //}(newPageConn) + + // context for page rendering. Separate from the rpcc.Conn + ctx, cancel := context.WithTimeout(context.TODO(), totalTimeout) + defer cancel() + ctx, cancel = chromedp.NewContext(ctx, chromedp.WithTargetID(target2.ID(targetID))) + defer cancel() + + client := cdp.NewClient(newPageConn) - c := cdp.NewClient(conn) + //ctx, cancel := context.WithTimeout(context.TODO(), timeout*time.Duration(retries)) + // + //// read config file + //mcfg, err := configTypes.ReadSummmonerConfig(v1.Sub("summoner")) + // + //// Use the DevTools HTTP/JSON API to manage targets (e.g. pages, webworkers). + ////devt := devtool.New(mcfg["headless"]) + //devt := devtool.New(mcfg.Headless) + // + //pt, err := devt.Get(ctx, devtool.Page) + //if err != nil { + // pt, err = devt.Create(ctx) + // if err != nil { + // log.WithFields(log.Fields{"url": url, "issue": "Not REPO FAULT. Devtools... Is Headless Container running?"}).Error(err) + // repologger.WithFields(log.Fields{"url": url}).Error("Not REPO FAULT. Devtools... Is Headless Container running?") + // repoStats.Inc(common.HeadlessError) + // return response, err + // } + //} + // + //// Initiate a new RPC connection to the Chrome DevTools Protocol target. + //conn, err := rpcc.DialContext(ctx, pt.WebSocketDebuggerURL) + //if err != nil { + // log.WithFields(log.Fields{"url": url, "issue": "Not REPO FAULT. Devtools... Is Headless Container running?"}).Error(err) + // repologger.WithFields(log.Fields{"url": url}).Error("Not REPO FAULT. Devtools... Is Headless Container running?") + // repoStats.Inc(common.HeadlessError) + // return response, err + //} + //defer conn.Close() // Leaving connections open will leak memory. + // + //// attempt to use session. failed. + ////sessionclient := cdp.NewClient(conn) // conn created via rpcc.Dial. 
+ ////m, err := session.NewManager(sessionclient) + ////if err != nil { + //// // Handle error. + ////} + ////defer m.Close() + //// + ////newPage, err := sessionclient.Target.CreateTarget(ctx, + //// target.NewCreateTargetArgs("about:blank")) + ////if err != nil { + //// log.WithFields(log.Fields{"url": url, "issue": "Not REPO FAULT. NewCreateTargetArgs... Is Headless Container running?"}).Error(err) + //// repologger.WithFields(log.Fields{"url": url}).Error("Not REPO FAULT. NewCreateTargetArgs... Is Headless Container running?") + //// repoStats.Inc(common.HeadlessError) + //// return response, err + ////} + //// + ////// newPageConn uses the underlying conn without establishing a new + ////// websocket connection. + ////newPageConn, err := m.Dial(ctx, newPage.TargetID) + ////if err != nil { + //// log.WithFields(log.Fields{"url": url, "issue": "Not REPO FAULT. newPageConn... Is Headless Container running?"}).Error(err) + //// repologger.WithFields(log.Fields{"url": url}).Error("Not REPO FAULT. newPageConn... Is Headless Container running?") + //// repoStats.Inc(common.HeadlessError) + //// return response, err + ////} + ////defer newPageConn.Close() + //// + ////c := cdp.NewClient(newPageConn) + // + //c := cdp.NewClient(conn) + c := client // Listen to Page events so we can receive DomContentEventFired, which // is what tells us when the page is done loading err = c.Page.Enable(ctx) @@ -259,6 +405,7 @@ func PageRender(v1 *viper.Viper, timeout time.Duration, url, k string, repologge // Create the Navigate arguments with the optional Referrer field set. navArgs := page.NewNavigateArgs(url) nav, err := c.Page.Navigate(ctx, navArgs) + if err != nil { log.WithFields(log.Fields{"url": url, "issue": "Navigate To Headless"}).Error(err) repologger.WithFields(log.Fields{"url": url, "issue": "Navigate To Headless"}).Error(err) @@ -266,14 +413,22 @@ func PageRender(v1 *viper.Viper, timeout time.Duration, url, k string, repologge return response, err } - _, err = loadEventFired.Recv() - if err != nil { - return nil, err + if loadEventReply, err := loadEventFired.Recv(); err != nil { + log.Trace(loadEventReply) + log.Errorf(" loadEventFired error original targetID: %s url: %s", targetID, url) + log.Errorf("loadEventFired error context TargetID: %s url: %s", ctx.Value("targetId"), url) + log.WithFields(log.Fields{"url": url, "issue": "Headless Load Event Error"}).Error(err) + repologger.WithFields(log.Fields{"url": url, "issue": "Headless Load Event Error"}).Error(err) + repoStats.Inc(common.HeadlessError) + return response, err } loadEventFired.Close() // Wait until we have a DOMContentEventFired event. 
-	if _, err = domContent.Recv(); err != nil {
+	if contentReply, err := domContent.Recv(); err != nil {
+		log.Trace(contentReply)
+		log.Errorf(" domContent.Recv error original targetID: %s url: %s", targetID, url)
+		log.Errorf("domContent.Recv error context TargetID: %s url: %s", ctx.Value("targetId"), url)
 		log.WithFields(log.Fields{"url": url, "issue": "Dom Error"}).Error(err)
 		repologger.WithFields(log.Fields{"url": url, "issue": "Dom Error"}).Error(err)
 		repoStats.Inc(common.HeadlessError)
@@ -304,7 +459,7 @@ func PageRender(v1 *viper.Viper, timeout time.Duration, url, k string, repologge
 						resolve(metadata);
 					} else {
-						reject("No JSON-LD present after {{timeout}} second.");
+						reject("No JSON-LD present after {{timeout}} milliseconds.");
 					}
 				});
 			}
@@ -370,34 +525,39 @@ func PageRender(v1 *viper.Viper, timeout time.Duration, url, k string, repologge
 		repologger.WithFields(log.Fields{"url": url, "issue": "Multiple JSON"}).Debug(err)
 	}
 	for _, jsonld := range jsonlds {
-		valid, err := isValid(v1, jsonld)
-		if err != nil {
-			// there could be one bad jsonld, and one good. We want to process the jsonld
-			// so, do not set an err
-			log.WithFields(log.Fields{"url": url, "issue": "invalid JSON"}).Error("error checking for valid json :", err)
-			repologger.WithFields(log.Fields{"url": url, "issue": "invalid JSON"}).Error(err)
-			repoStats.Inc(common.Issues)
-		} else if valid && jsonld != "" { // traps out the root domain... should do this different
+		// just grab them all; we may validate and fix them later, in the upload process.
+		if jsonld != "" {
 			response = append(response, jsonld)
-			err = nil
-			// need to just return a list
-
-		} else {
-			// there could be one bad jsonld, and one good. We want to process the jsonld
-			// so, do not set an err
-			log.Info("Empty JSON-LD document found. Continuing.", url)
-			repologger.WithFields(log.Fields{"url": url, "issue": "Empty JSON-LD document found"}).Debug()
-			repoStats.Inc(common.EmptyDoc)
-			// TODO Is here where to add an entry to the KV store
-			//err = db.Update(func(tx *bolt.Tx) error {
-			//	b := tx.Bucket([]byte(k))
-			//	err := b.Put([]byte(url), []byte("NULL")) // no JOSN-LD found at this URL
-			//	if err != nil {
-			//		log.Error("Error writing to bolt", err)
-			//	}
-			//	return nil
-			//})
 		}
+
+		//valid, err := isValid(v1, jsonld)
+		//if err != nil {
+		//	// there could be one bad jsonld, and one good. We want to process the jsonld
+		//	// so, do not set an err
+		//	log.WithFields(log.Fields{"url": url, "issue": "invalid JSON"}).Error("error checking for valid json :", err)
+		//	repologger.WithFields(log.Fields{"url": url, "issue": "invalid JSON"}).Error(err)
+		//	repoStats.Inc(common.Issues)
+		//} else if valid && jsonld != "" { // traps out the root domain... should do this different
+		//	response = append(response, jsonld)
+		//	err = nil
+		//	// need to just return a list
+		//
+		//} else {
+		//	// there could be one bad jsonld, and one good. We want to process the jsonld
+		//	// so, do not set an err
+		//	log.Info("Empty JSON-LD document found. Continuing.", url)
+		//	repologger.WithFields(log.Fields{"url": url, "issue": "Empty JSON-LD document found"}).Error()
+		//	repoStats.Inc(common.EmptyDoc)
+		//	// TODO Is here where to add an entry to the KV store
+		//	//err = db.Update(func(tx *bolt.Tx) error {
+		//	//	b := tx.Bucket([]byte(k))
+		//	//	err := b.Put([]byte(url), []byte("NULL")) // no JSON-LD found at this URL
+		//	//	if err != nil {
+		//	//		log.Error("Error writing to bolt", err)
+		//	//	}
+		//	//	return nil
+		//	//})
+		//}
 	}

 	return response, err
diff --git a/internal/summoner/acquire/headless_test.go b/internal/summoner/acquire/headless_test.go
index 95ee0382..1d76ecd9 100644
--- a/internal/summoner/acquire/headless_test.go
+++ b/internal/summoner/acquire/headless_test.go
@@ -1,7 +1,15 @@
 package acquire

 import (
+	"context"
+	"github.com/chromedp/chromedp"
 	"github.com/gleanerio/gleaner/internal/common"
+	"github.com/mafredri/cdp"
+	"github.com/mafredri/cdp/devtool"
+	"github.com/mafredri/cdp/protocol/target"
+	"github.com/mafredri/cdp/rpcc"
+	"github.com/mafredri/cdp/session"
+	log "github.com/sirupsen/logrus"
 	"github.com/spf13/viper"
 	"github.com/stretchr/testify/assert"
 	"net/http"
@@ -13,19 +21,19 @@ var HEADLESS_URL = "http://127.0.0.1:9222"

 func PingHeadless() (int, error) {
 	var client = http.Client{
-		Timeout: 2 * time.Second,
+		Timeout: 2 * time.Second,
 	}
 	req, err := http.NewRequest("HEAD", HEADLESS_URL, nil)
-	if err != nil {
-		return 0, err
-	}
-	resp, err := client.Do(req)
-	if err != nil {
-		return 0, err
-	}
-	resp.Body.Close()
-	return resp.StatusCode, nil
+	if err != nil {
+		return 0, err
+	}
+	resp, err := client.Do(req)
+	if err != nil {
+		return 0, err
+	}
+	resp.Body.Close()
+	return resp.StatusCode, nil
 }

 // need to have some test that checks if headless is actually running.
@@ -33,7 +41,7 @@ func PingHeadless() (int, error) {
 func TestHeadlessNG(t *testing.T) {
 	status, err := PingHeadless()

-	if(err != nil || status != 200) {
+	if err != nil || status != 200 {
 		t.Skip("Skipping headless tests because no headless browser is running.")
 	}
@@ -47,7 +55,7 @@ func TestHeadlessNG(t *testing.T) {
 		{name: "r2r_wait_5_works_returns_2_jsonld",
 			url:          "https://dev.rvdata.us/search/fileset/100135",
 			jsonldcount:  2,
-			headlessWait: 5,
+			headlessWait: 20,
 		},
 		{name: "r2r_expectedfail_wait_0_returns_1_jsonld_fails_if_2_jsonld",
 			url: "https://dev.rvdata.us/search/fileset/100135",
@@ -70,9 +78,58 @@ func TestHeadlessNG(t *testing.T) {
 		for key, value := range conf {
 			viper.Set(key, value)
 		}
+
+		ctx, cancel := chromedp.NewContext(context.TODO())
+		defer cancel()
+
+		// Use the DevTools HTTP/JSON API to manage targets (e.g. pages, webworkers).
+		//devt := devtool.New(mcfg["headless"])
+		devt := devtool.New(HEADLESS_URL)
+
+		pt, err := devt.Get(ctx, devtool.Page)
+		if err != nil {
+			pt, err = devt.Create(ctx)
+			if err != nil {
+				log.WithFields(log.Fields{"issue": "Not REPO FAULT. Devtools... Is Headless Container running?"}).Error(err)
+				return
+			}
+		}
+
+		// Initiate a new RPC connection to the Chrome DevTools Protocol target.
+		conn, err := rpcc.DialContext(ctx, pt.WebSocketDebuggerURL)
+		if err != nil {
+			log.WithFields(log.Fields{"issue": "Not REPO FAULT. Devtools... Is Headless Container running?"}).Error(err)

+			return
+		}
+		defer conn.Close()
+		sessionclient := cdp.NewClient(conn)
+		manager, err := session.NewManager(sessionclient)
+		if err != nil {
+			log.WithFields(log.Fields{"issue": "Not REPO FAULT. session.NewManager... Is Headless Container running?"}).Error(err)
+			return
+		}
+		defer manager.Close()
+		args := target.NewCreateTargetArgs("")
+		//args.SetNewWindow(true)
+		newPage, err := sessionclient.Target.CreateTarget(ctx,
+			args)
+		if err != nil {
+			log.WithFields(log.Fields{"url": test.url, "issue": "Not REPO FAULT. NewCreateTargetArgs... Is Headless Container running?"}).Error(err)
+
+			return
+		}
+		closeArgs := target.NewCloseTargetArgs(newPage.TargetID)
+		defer func(Target cdp.Target, ctx context.Context, args *target.CloseTargetArgs) {
+			log.Info("Close Target Defer")
+			_, err := Target.CloseTarget(ctx, args)
+			if err != nil {
+				log.WithFields(log.Fields{"url": test.url, "issue": "error closing target"}).Error("PageRenderAndUpload ", test.url, " ::", err)
+
+			}
+		}(sessionclient.Target, ctx, closeArgs)
 		repoLogger, _ := common.LogIssues(viper, test.name)
 		t.Run(test.name, func(t *testing.T) {
-			jsonlds, err := PageRender(viper, 5*time.Second, test.url, test.name, repoLogger, runstats)
+			jsonlds, err := PageRender(viper, 60*time.Second, test.url, test.name, repoLogger, runstats, manager, newPage.TargetID)
 			if !test.expectedFail {
 				assert.Equal(t, test.jsonldcount, len(jsonlds))
 			} else {
diff --git a/internal/summoner/summoner.go b/internal/summoner/summoner.go
index e881816f..b8d5bd95 100644
--- a/internal/summoner/summoner.go
+++ b/internal/summoner/summoner.go
@@ -71,7 +71,8 @@ func Summoner(mc *minio.Client, v1 *viper.Viper) {
 	// just report the error, and then run gathered urls
 	if len(hru) > 0 {
 		log.Info("running headless:")
-		acquire.HeadlessNG(v1, mc, hru, runStats)
+		//acquire.HeadlessNG(v1, mc, hru, runStats)
+		acquire.ResRetrieve(v1, mc, hru, runStats)
 	}

 	// Time report
diff --git a/resoruces/jsonld/neotoma_samesha1_2.jsonld b/resoruces/jsonld/neotoma_samesha1_2.jsonld
new file mode 100644
index 00000000..c83c6229
--- /dev/null
+++ b/resoruces/jsonld/neotoma_samesha1_2.jsonld
@@ -0,0 +1 @@
+{"@context":"https://schema.org","@type":"Dataset","license":"https://creativecommons.org/licenses/by/4.0/deed.en_US","name":"15/2 Pollen surface sample dataset","description":"Landing page for Pollen surface sample data from 15/2, including data download options and linked resources.","includedInDataCatalog":{"@type":"DataCatalog","name":"North American Pollen Database","about":"Paleoecology","publication":"Williams JW, et al. (2018). The Neotoma Paleoecology Database, a multiproxy, international, community-curated data resource. Quaternary Research, 89(1), 156-177.","publisher":{"@type":"Organization","name":"Neotoma Paleoecological Database","alternateName":"Neotoma","description":"The Neotoma Paleoecology Database and Community is an online hub for data, research, education, and discussion about paleoenvironments.","url":"https://neotomadb.org"},"funder":{"@type":"Organization","name":"National Sciences Foundation","alternateName":"NSF","url":"https://nsf.gov"},"isAccessibleForFree":true},"about":"","distribution":{"@type":"DataDownload","contentUrl":"https://api.neotomadb.org/v2.0/data/downloads/2","datePublished":"2018-02-02 14:24:27","inLanguage":"en","encodingFormat":"application/json"},"spatialCoverage":{"@type":"Place","name":"15/2","geo":{"@type":"GeoCoordinates","latitude":55.83333,"longitude":-75.01667,"elevation":305}}}
\ No newline at end of file
diff --git a/resoruces/jsonld/noetoma_samesha1_47.jsonld b/resoruces/jsonld/noetoma_samesha1_47.jsonld
new file mode 100644
index 00000000..6303afd2
--- /dev/null
+++ b/resoruces/jsonld/noetoma_samesha1_47.jsonld
@@ -0,0 +1 @@
+{"@context":"https://schema.org","@type":"Dataset","license":"https://creativecommons.org/licenses/by/4.0/deed.en_US","name":"Liberty Pollen surface sample dataset","description":"Landing page for Pollen surface sample data from Liberty, including data download options and linked resources.","includedInDataCatalog":{"@type":"DataCatalog","name":"North American Pollen Database","about":"Paleoecology","publication":"Williams JW, et al. (2018). The Neotoma Paleoecology Database, a multiproxy, international, community-curated data resource. Quaternary Research, 89(1), 156-177.","publisher":{"@type":"Organization","name":"Neotoma Paleoecological Database","alternateName":"Neotoma","description":"The Neotoma Paleoecology Database and Community is an online hub for data, research, education, and discussion about paleoenvironments.","url":"https://neotomadb.org"},"funder":{"@type":"Organization","name":"National Sciences Foundation","alternateName":"NSF","url":"https://nsf.gov"},"isAccessibleForFree":true},"about":"","distribution":{"@type":"DataDownload","contentUrl":"https://api.neotomadb.org/v2.0/data/downloads/47","datePublished":"2018-02-02 14:24:27","inLanguage":"en","encodingFormat":"application/json"},"spatialCoverage":{"@type":"Place","name":"Liberty","geo":{"@type":"GeoCoordinates","latitude":43.52,"longitude":-90.78,"elevation":305}}}
\ No newline at end of file
diff --git a/resoruces/jsonld/ssdbiodp_1.jsonld b/resoruces/jsonld/ssdbiodp_1.jsonld
new file mode 100644
index 00000000..1eb11b98
--- /dev/null
+++ b/resoruces/jsonld/ssdbiodp_1.jsonld
@@ -0,0 +1,2 @@
+
+ {"@context":"http:\/\/schema.org\/","@id":"https:\/\/ssdb.iodp.org\/secureGetObj.php?id=116112","@type":"Dataset","name":"Undefined","license":"TBD","datePublished":"2006-11-30","description":"Files from the Site Survey Data Bank (ssdb.iodp.org) Scanned from archived paper original. Related Activity: Expedition: SO149\n","distribution":{"@type":"DataDownload","contentUrl":"https:\/\/ssdb.iodp.org\/secureGetObj.php?id=116112","encodingFormat":"image\/tiff","description":"registered PDB\/SSDB account required; this is free and can be obtained by registering at proposals.iodp.org"},"creator":{"@type":"Person","name":"University of California, Santa Cruz"},"spatialCoverage":{"@type":"Place","geo":{"@type":"GeoShape","box":"47.8333,-128.633 47.9667,-128.5"}},"ssdb":{"sitenames":["FR-1A"],"datatype":"Navigation Data","proposal":"P545","format":"TIFF","AccessControl":"release","status":3,"georeference":{"type":"Polygon","coordinates":[[[-128.632995605,47.833301544],[-128.632995605,47.966701508],[-128.5,47.966701508],[-128.5,47.833301544],[-128.632995605,47.833301544]]]}}}
diff --git a/resoruces/jsonld/ssdbiodp_2.jsonld b/resoruces/jsonld/ssdbiodp_2.jsonld
new file mode 100644
index 00000000..0e239be8
--- /dev/null
+++ b/resoruces/jsonld/ssdbiodp_2.jsonld
@@ -0,0 +1,2 @@
+
+ {"@context":"http:\/\/schema.org\/","@id":"https:\/\/ssdb.iodp.org\/secureGetObj.php?id=125101","@type":"Dataset","name":"LAKO_1043.sgy","license":"TBD","datePublished":"2019-11-12","description":"Files from the Site Survey Data Bank (ssdb.iodp.org) High resolution 2D multichannel seismic reflection data from LAKO 2019 expedition.","distribution":{"@type":"DataDownload","contentUrl":"https:\/\/ssdb.iodp.org\/secureGetObj.php?id=125101","encodingFormat":"application\/octet-stream","description":"registered PDB\/SSDB account required; this is free and can be obtained by registering at proposals.iodp.org"},"creator":{"@type":"Person","name":"Paul Knutz"},"spatialCoverage":{"@type":"Place","geo":{"@type":"GeoShape","line":"73.85093,-62.18863 74.2395,-61.54886"}},"ssdb":{"sitenames":["MB-04C"],"datatype":"Seismic SEGY","proposal":"P909","format":"SEGY","AccessControl":"release","status":3,"georeference":{"type":"LineString","coordinates":[[-62.1886,73.8509],[-61.5489,74.2395]]}}}
diff --git a/resoruces/jsonld/unavco_1.jsonld b/resoruces/jsonld/unavco_1.jsonld
new file mode 100644
index 00000000..7e42c7da
--- /dev/null
+++ b/resoruces/jsonld/unavco_1.jsonld
@@ -0,0 +1,336 @@
+
+{
+  "@context":{"@vocab":"https://schema.org/"},
+  "@type":"Dataset",
+  "name":"Turkana GPS Network",
+  "description":"GPS/GNSS stations: Long-term continuous or semi-continuous occupations at multiple locations",
+  "@id":"https://doi.org/10.7283/WF02-PP16",
+  "url":"https://doi.org/10.7283/WF02-PP16",
+  "identifier":"10.7283/WF02-PP16",
+  "license":"https://creativecommons.org/licenses/by/4.0/",
+  "creator":{
+    "@type":"Organization",
+    "url":"https://www.unavco.org/",
+    "name":"The GAGE Facility",
+    "@id": "https://ror.org/04danrt76",
+    "contactPoint":{
+      "@type":"ContactPoint",
+      "contactType": "support service",
+      "email":"data-help@earthscope.org"
+    }
+  },
+  "includedInDataCatalog":{
+    "@type":"DataCatalog",
+    "name":"unavco.org"
+  },
+  "hasPart":[
+    {
+      "@type":"Dataset",
+      "@id":"https://doi.org/10.7283/T5W66J53",
+      "url":"https://doi.org/10.7283/T5W66J53",
+      "name":"Aggregate dataset member 1) 2017-02-22 - 2021-01-25: Turkwel, Kenya - XTBT-Turkwel Turkana P.S.",
+      "description":"Part of aggregate dataset: Turkana GPS Network.",
+      "license":"https://creativecommons.org/licenses/by/4.0/",
+      "creator":{
+        "@type":"Organization",
+        "url":"https://www.earthscope.org/",
+        "name":"The GAGE Facility",
+        "@id": "https://ror.org/04danrt76",
+        "contactPoint":{
+          "@type":"ContactPoint",
+          "contactType": "support service",
+          "email":"data-help@earthscope.org"
+        }
+      }
+    },
+    {
+      "@type":"Dataset",
+      "@id":"https://doi.org/10.7283/4XDR-FW31",
+      "url":"https://doi.org/10.7283/4XDR-FW31",
+      "name":"Aggregate dataset member 2) 2018-02-12 - 2021-01-23: Turkana GPS Network - XTBI-Ileret Turkana P.S.",
+      "description":"Part of aggregate dataset: Turkana GPS Network.",
+      "license":"https://creativecommons.org/licenses/by/4.0/",
+      "creator":{
+        "@type":"Organization",
+        "url":"https://www.earthscope.org/",
+        "name":"The GAGE Facility",
+        "@id": "https://ror.org/04danrt76",
+        "contactPoint":{
+          "@type":"ContactPoint",
+          "contactType": "support service",
+          "email":"data-help@earthscope.org"
+        }
+      }
+    },
+    {
+      "@type":"Dataset",
+      "@id":"https://doi.org/10.7283/CSVF-CN55",
+      "url":"https://doi.org/10.7283/CSVF-CN55",
+      "name":"Aggregate dataset member 3) 2019-01-18 - 2021-01-31: Turkana GPS Network - XTRM-Turmi P.S.",
+      "description":"Part of aggregate dataset: Turkana GPS Network.",
+      "license":"https://creativecommons.org/licenses/by/4.0/",
+      "creator":{
+        "@type":"Organization",
+        "url":"https://www.earthscope.org/",
+        "name":"The GAGE Facility",
+        "@id": "https://ror.org/04danrt76",
+        "contactPoint":{
+          "@type":"ContactPoint",
+          "contactType": "support service",
+          "email":"data-help@earthscope.org"
+        }
+      }
+    },
+    {
+      "@type":"Dataset",
+      "@id":"https://doi.org/10.7283/9RS1-K054",
+      "url":"https://doi.org/10.7283/9RS1-K054",
+      "name":"Aggregate dataset member 4) 2019-01-19 - 2021-02-02: Turkana GPS Network - XJNK-Jinka P.S.",
+      "description":"Part of aggregate dataset: Turkana GPS Network.",
+      "license":"https://creativecommons.org/licenses/by/4.0/",
+      "creator":{
+        "@type":"Organization",
+        "url":"https://www.earthscope.org/",
+        "name":"The GAGE Facility",
+        "@id": "https://ror.org/04danrt76",
+        "contactPoint":{
+          "@type":"ContactPoint",
+          "contactType": "support service",
+          "email":"data-help@earthscope.org"
+        }
+      }
+    },
+    {
+      "@type":"Dataset",
+      "@id":"https://doi.org/10.7283/X7VZ-T682",
+      "url":"https://doi.org/10.7283/X7VZ-T682",
+      "name":"Aggregate dataset member 5) 2019-01-21 - 2021-01-30: Turkana GPS Network - XYAB-Yabelo P.S.",
+      "description":"Part of aggregate dataset: Turkana GPS Network.",
+      "license":"https://creativecommons.org/licenses/by/4.0/",
+      "creator":{
+        "@type":"Organization",
+        "url":"https://www.earthscope.org/",
+        "name":"The GAGE Facility",
+        "@id": "https://ror.org/04danrt76",
+        "contactPoint":{
+          "@type":"ContactPoint",
+          "contactType": "support service",
+          "email":"data-help@earthscope.org"
+        }
+      }
+    },
+    {
+      "@type":"Dataset",
+      "@id":"https://doi.org/10.7283/BCJ8-6988",
+      "url":"https://doi.org/10.7283/BCJ8-6988",
+      "name":"Aggregate dataset member 6) 2019-01-23 - 2021-01-30: Turkana GPS Network - XMOY-Moyale P.S.",
+      "description":"Part of aggregate dataset: Turkana GPS Network.",
+      "license":"https://creativecommons.org/licenses/by/4.0/",
+      "creator":{
+        "@type":"Organization",
+        "url":"https://www.earthscope.org/",
+        "name":"The GAGE Facility",
+        "@id": "https://ror.org/04danrt76",
+        "contactPoint":{
+          "@type":"ContactPoint",
+          "contactType": "support service",
+          "email":"data-help@earthscope.org"
+        }
+      }
+    },
+    {
+      "@type":"Dataset",
+      "@id":"https://doi.org/10.7283/JB3C-0166",
+      "url":"https://doi.org/10.7283/JB3C-0166",
+      "name":"Aggregate dataset member 7) 2019-01-24 - 2021-01-31: Turkana GPS Network - XTEL-Teltele P.S.",
+      "description":"Part of aggregate dataset: Turkana GPS Network.",
+      "license":"https://creativecommons.org/licenses/by/4.0/",
+      "creator":{
+        "@type":"Organization",
+        "url":"https://www.earthscope.org/",
+        "name":"The GAGE Facility",
+        "@id": "https://ror.org/04danrt76",
+        "contactPoint":{
+          "@type":"ContactPoint",
+          "contactType": "support service",
+          "email":"data-help@earthscope.org"
+        }
+      }
+    },
+    {
+      "@type":"Dataset",
+      "@id":"https://doi.org/10.7283/BJ0E-YB91",
+      "url":"https://doi.org/10.7283/BJ0E-YB91",
+      "name":"Aggregate dataset member 8) 2019-02-22 - 2021-01-20: Turkana GPS Network - XSAS-Marsabit Sasura Girls School P.S.",
+      "description":"Part of aggregate dataset: Turkana GPS Network.",
+      "license":"https://creativecommons.org/licenses/by/4.0/",
+      "creator":{
+        "@type":"Organization",
+        "url":"https://www.earthscope.org/",
+        "name":"The GAGE Facility",
+        "@id": "https://ror.org/04danrt76",
+        "contactPoint":{
+          "@type":"ContactPoint",
+          "contactType": "support service",
+          "email":"data-help@earthscope.org"
+        }
+      }
+    },
+    {
+      "@type":"Dataset",
+      "@id":"https://doi.org/10.7283/GWQR-N363",
+      "url":"https://doi.org/10.7283/GWQR-N363",
+      "name":"Aggregate dataset member 9) 2019-02-23 - 2021-01-19: Turkana GPS Network - XHOR-North Horr P.S.",
+      "description":"Part of aggregate dataset: Turkana GPS Network.",
+      "license":"https://creativecommons.org/licenses/by/4.0/",
+      "creator":{
+        "@type":"Organization",
+        "url":"https://www.earthscope.org/",
+        "name":"The GAGE Facility",
+        "@id": "https://ror.org/04danrt76",
+        "contactPoint":{
+          "@type":"ContactPoint",
+          "contactType": "support service",
+          "email":"data-help@earthscope.org"
+        }
+      }
+    },
+    {
+      "@type":"Dataset",
+      "@id":"https://doi.org/10.7283/SVT7-GR73",
+      "url":"https://doi.org/10.7283/SVT7-GR73",
+      "name":"Aggregate dataset member 10) 2019-02-24 - 2021-01-20: Turkana GPS Network - XLOY-El Molo Bay P.S.",
+      "description":"Part of aggregate dataset: Turkana GPS Network.",
+      "license":"https://creativecommons.org/licenses/by/4.0/",
+      "creator":{
+        "@type":"Organization",
+        "url":"https://www.earthscope.org/",
+        "name":"The GAGE Facility",
+        "@id": "https://ror.org/04danrt76",
+        "contactPoint":{
+          "@type":"ContactPoint",
+          "contactType": "support service",
+          "email":"data-help@earthscope.org"
+        }
+      }
+    },
+    {
+      "@type":"Dataset",
+      "@id":"https://doi.org/10.7283/9CVK-3A35",
+      "url":"https://doi.org/10.7283/9CVK-3A35",
+      "name":"Aggregate dataset member 11) 2019-02-24 - 2021-01-25: Turkana GPS Network - XLOK-Lokichogio P.S.",
+      "description":"Part of aggregate dataset: Turkana GPS Network.",
+      "license":"https://creativecommons.org/licenses/by/4.0/",
+      "creator":{
+        "@type":"Organization",
+        "url":"https://www.earthscope.org/",
+        "name":"The GAGE Facility",
+        "@id": "https://ror.org/04danrt76",
+        "contactPoint":{
+          "@type":"ContactPoint",
+          "contactType": "support service",
+          "email":"data-help@earthscope.org"
+        }
+      }
+    }
+  ],
+  "distribution":[
+    {
+      "@type":"DataDownload",
+      "encodingFormat":"CSV",
+      "contentUrl":""
+    }
+  ],
+  "citation":"Bendick, Rebecca, Knappe, Ellen, Bastow, Ian, Ebinger, Cynthia J., Mariita, Nicholas, Kianji, Gladys, Nengo, Isaiah, 2017, Turkana GPS Network, The GAGE Facility operated by EarthScope Consortium, GPS/GNSS Observations (Aggregation of Multiple Datasets), https://doi.org/10.7283/WF02-PP16",
+  "temporalCoverage":"2017-02-22/2021-02-02",
+  "spatialCoverage":[
+    {
+      "@type":"Place",
+      "geo":{
+        "@type":"GeoCoordinates",
+        "latitude":"2.2865",
+        "longitude":"38.0853"
+      }
+    },
+    {
+      "@type":"Place",
+      "geo":{
+        "@type":"GeoCoordinates",
+        "latitude":"2.8587",
+        "longitude":"36.7001"
+      }
+    },
+    {
+      "@type":"Place",
+      "geo":{
+        "@type":"GeoCoordinates",
+        "latitude":"3.1393",
+        "longitude":"35.8669"
+      }
+    },
+    {
+      "@type":"Place",
+      "geo":{
+        "@type":"GeoCoordinates",
+        "latitude":"3.3124",
+        "longitude":"37.0597"
+      }
+    },
+    {
+      "@type":"Place",
+      "geo":{
+        "@type":"GeoCoordinates",
+        "latitude":"3.5450",
+        "longitude":"39.0399"
+      }
+    },
+    {
+      "@type":"Place",
+      "geo":{
+        "@type":"GeoCoordinates",
+        "latitude":"4.2070",
+        "longitude":"34.3438"
+      }
+    },
+    {
+      "@type":"Place",
+      "geo":{
+        "@type":"GeoCoordinates",
+        "latitude":"4.2858",
+        "longitude":"36.2622"
+      }
+    },
+    {
+      "@type":"Place",
+      "geo":{
+        "@type":"GeoCoordinates",
+        "latitude":"4.8823",
+        "longitude":"38.0973"
+      }
+    },
+    {
+      "@type":"Place",
+      "geo":{
+        "@type":"GeoCoordinates",
+        "latitude":"4.9749",
+        "longitude":"36.4797"
+      }
+    },
+    {
+      "@type":"Place",
+      "geo":{
+        "@type":"GeoCoordinates",
+        "latitude":"5.0306",
+        "longitude":"37.3747"
+      }
+    },
+    {
+      "@type":"Place",
+      "geo":{
+        "@type":"GeoCoordinates",
+        "latitude":"5.7557",
+        "longitude":"36.5950"
+      }
+    }
+  ]
+}
diff --git a/runConfigurations/glcon config gnerate test.run.xml b/runConfigurations/glcon config gnerate test.run.xml
new file mode 100644
index 00000000..e7faab7c
--- /dev/null
+++ b/runConfigurations/glcon config gnerate test.run.xml
@@ -0,0 +1,23 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/runConfigurations/gleaner batch headless.run.xml b/runConfigurations/gleaner batch headless.run.xml
new file mode 100644
index 00000000..7bd7137b
--- /dev/null
+++ b/runConfigurations/gleaner batch headless.run.xml
@@ -0,0 +1,23 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/runConfigurations/gleaner batch mxied.run.xml b/runConfigurations/gleaner batch mxied.run.xml
new file mode 100644
index 00000000..82783174
--- /dev/null
+++ b/runConfigurations/gleaner batch mxied.run.xml
@@ -0,0 +1,23 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/runConfigurations/go test gleaner.run.xml b/runConfigurations/go test gleaner.run.xml
index 9669d5e9..24fe579a 100644
--- a/runConfigurations/go test gleaner.run.xml
+++ b/runConfigurations/go test gleaner.run.xml
@@ -9,10 +9,11 @@