Skip to content

Commit

Permalink
Merge branch '239_dev_headless' into dev_ec
Browse files Browse the repository at this point in the history
  • Loading branch information
valentinedwv committed Sep 11, 2023
2 parents 6fc0773 + 02601d5 commit eeb7726
Show file tree
Hide file tree
Showing 28 changed files with 1,583 additions and 174 deletions.
2 changes: 1 addition & 1 deletion cmd/husker/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ func main() {

runStats := common.NewRunStats()
repostats := runStats.Add(k)
err = acquire.PageRenderAndUpload(v1, mc, 45*time.Second, url, k, rlogginer, repostats)
err = acquire.PageRenderAndUpload(v1, mc, 45*time.Second, url, k, rlogginer, repostats, nil, "")
if err != nil {
panic(fmt.Errorf("error when reading config: %v", err))
}
Expand Down
66 changes: 66 additions & 0 deletions configs/test_sources/gleaner
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
context:
cache: true
contextmaps:
- file: ./configs/schemaorg-current-https.jsonld
prefix: https://schema.org/
- file: ./configs/schemaorg-current-https.jsonld
prefix: http://schema.org/
gleaner:
mill: true
runid: runX
summon: true
millers:
graph: true
minio:
address: oss.geocodes-aws-dev.earthcube.org
port: 443
ssl: true
bucket: dvtest
region: ""
accesskey: worldsbestaccesskey
secretkey: worldsbestsecretkey
sources:
- sourcetype: sitemap
name: headless
logo: https://opentopography.org/sites/opentopography.org/files/ot_transp_logo_2.png
url: https://earthcube.github.io/GeoCODES-Metadata/site/sitemaps/headless.xml
headless: true
pid: https://www.re3data.org/repository/r3d100010655
propername: TEST HEADLESS SOURCES
domain: http://wwwearthcube.org/headless/
active: true
credentialsfile: ""
other: {}
headlesswait: 0
delay: 0
identifierpath: ' "$.distribution.contentUrl"'
apipagelimit: 0
identifiertype: identifiersha
fixcontextoption: 0
acceptcontenttype: application/ld+json, text/html
jsonprofile: application/ld+json
- sourcetype: sitemap
name: mixed
logo: http://ds.iris.edu/static/img/layout/logos/iris_logo_shadow.png
url: http://ds.iris.edu/files/sitemap.xml
headless: false
pid: https://www.re3data.org/repository/r3d100010268
propername: TEST MIXED SOURCES
domain: http://wwwearthcube.org/headless/
active: true
credentialsfile: ""
other: {}
headlesswait: 0
delay: 0
identifierpath: ""
apipagelimit: 0
identifiertype: identifiersha
fixcontextoption: 0
acceptcontenttype: application/ld+json, text/html
jsonprofile: application/ld+json
summoner:
after: ""
delay: null
headless: http://127.0.0.1:9222
mode: full
threads: 5
42 changes: 42 additions & 0 deletions configs/test_sources/gleaner_base.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
---
minio:
address: 0.0.0.0
port: 9000
accessKey: worldsbestaccesskey
secretKey: worldsbestsecretkey
ssl: false
bucket: gleaner
gleaner:
runid: runX # this will be the bucket the output is placed in...
summon: true # do we want to visit the web sites and pull down the files
mill: true
context:
cache: true
contextmaps:
- prefix: "https://schema.org/"
file: "./configs/schemaorg-current-https.jsonld"
- prefix: "http://schema.org/"
file: "./configs/schemaorg-current-https.jsonld"
summoner:
after: "" # "21 May 20 10:00 UTC"
mode: full # full || diff: If diff compare what we have currently in gleaner to sitemap, get only new, delete missing
threads: 5
delay: # milliseconds (1000 = 1 second) to delay between calls (will FORCE threads to 1)
headless: http://127.0.0.1:9222 # URL for headless see docs/headless
millers:
graph: true
# will be built from sources.csv
#sitegraphs:
#- name: aquadocs
# url: https://oih.aquadocs.org/aquadocs.json
# headless: false
# pid: http://hdl.handle.net/1834/41372
# properName: AquaDocs
# domain: https://aquadocs.org
#sitemaps:
#- name: samplesearth
# url: https://samples.earth/sitemap.xml
# headless: false
# pid: https://www.re3data.org/repository/samplesearth
# properName: Samples Earth (DEMO Site)
# domain: https://samples.earth
26 changes: 26 additions & 0 deletions configs/test_sources/localConfig.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
---
minio:
address: oss.geocodes-aws-dev.earthcube.org
port: 443
accessKey: worldsbestaccesskey
secretKey: worldsbestsecretkey
ssl: true
bucket: dvtest # can be overridden with MINIO_BUCKET
sparql:
endpoint: https://graph.geocodes-dev.earthcube.org/blazegraph/namespace/earthcube/sparql
s3:
bucket: dvtest # sync with above... can be overridden with MINIO_BUCKET... gets zapped if it's not here.
domain: us-east-1

#headless field in gleaner.summoner
headless: http://127.0.0.1:9222
sourcesSource:
type: csv
location: sources.csv
#location: https://docs.google.com/spreadsheets/d/e/2PACX-1vTt_45dYd5LMFK9Qm_lCg6P7YxG-ae0GZEtrHMZmNbI-y5tVDd8ZLqnEeIAa-SVTSztejfZeN6xmRZF/pub?gid=1340502269&single=true&output=csv
# this can be a remote csv
# type: csv
# location: https://docs.google.com/spreadsheets/d/{key}/gviz/tq?tqx=out:csv&sheet={sheet_name}
# TBD -- Just use the sources in the gleaner file.
# type: yaml
# location: gleaner.yaml
23 changes: 23 additions & 0 deletions configs/test_sources/nabu
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
minio:
address: oss.geocodes-aws-dev.earthcube.org
port: 443
ssl: true
bucket: dvtest
region: ""
accesskey: worldsbestaccesskey
secretkey: worldsbestsecretkey
objects:
bucket: dvtest
domain: us-east-1
prefix:
- summoned/headless
- summoned/mixed
- org
prefixoff: []
sparql:
endpoint: https://graph.geocodes-dev.earthcube.org/blazegraph/namespace/earthcube/sparql
authenticate: false
username: ""
password: ""
txtaipkg:
endpoint: http://0.0.0.0:8000
27 changes: 27 additions & 0 deletions configs/test_sources/nabu_base.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
minio:
accesskey: worldsbestaccesskey
address: 0.0.0.0
bucket: gleaner
port: 9000
secretkey: worldsbestsecretkey
ssl: false
objects:
bucket: gleaner
domain: us-east-1
# prefix will be built using the sources.csv, and additional values
prefix:
- orgs
- summoned/obps
- prov/obps
- summoned/aquadocs
- prov/aquadocs
- milled/marinetraining
- prov/marinetraining
- milled/obis
- prov/obis
- milled/oceanexperts
- prov/oceanexperts
sparql:
endpoint: http://192.168.86.45:32775/blazegraph/namespace/lipd/sparql
txtaipkg:
endpoint: http://0.0.0.0:8000
3 changes: 3 additions & 0 deletions configs/test_sources/sources.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
hack,SourceType,Active,Name,ProperName,URL,Headless,IdentifierType,IdentifierPath,Domain,PID,Logo
3,sitemap,TRUE,headless,TEST HEADLESS SOURCES,https://earthcube.github.io/GeoCODES-Metadata/site/sitemaps/headless.xml,TRUE,identifiersha," ""$.distribution.contentUrl""",http://wwwearthcube.org/headless/,https://www.re3data.org/repository/r3d100010655,https://opentopography.org/sites/opentopography.org/files/ot_transp_logo_2.png
4,sitemap,TRUE,mixed,TEST MIXED SOURCES,http://ds.iris.edu/files/sitemap.xml,FALSE,identifiersha,,http://wwwearthcube.org/headless/,https://www.re3data.org/repository/r3d100010268,http://ds.iris.edu/static/img/layout/logos/iris_logo_shadow.png
13 changes: 9 additions & 4 deletions internal/common/identifier.go
Original file line number Diff line number Diff line change
Expand Up @@ -202,7 +202,12 @@ func GenerateNormalizedSha(v1 *viper.Viper, jsonld string) (Identifier, error) {
if uuid == "" {
// error
log.Error("ERROR: uuid generator:", "Action: Getting normalized sha Error:", err)
id = Identifier{}
id, _ = GenerateFileSha(v1, jsonld)
//id = Identifier{
// UniqueId: uuid,
// IdentifierType: config.FileSha,
// JsonSha: uuid,
//}
} else if err != nil {
// no error, then normalized triples generated
log.Info(" Action: Normalize sha generated sha:", uuid, " Error:", err)
Expand Down Expand Up @@ -234,13 +239,13 @@ func GenerateFileSha(v1 *viper.Viper, jsonld string) (Identifier, error) {
log.Error("ERROR: uuid generator:", "Action: Getting file sha")
id = Identifier{}
}
log.Debug(" Action: Json sha generated", uuid)
log.Debug(" Action: file sha generated", uuid)
id = Identifier{UniqueId: uuid,
IdentifierType: config.JsonSha,
IdentifierType: config.FileSha,
JsonSha: uuid,
}

log.Trace("jsonsha: ", uuid)
log.Trace("filesha: ", uuid)
// fmt.Println("\njsonsha:", id)
return id, nil
}
106 changes: 106 additions & 0 deletions internal/common/identifier_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -482,6 +482,77 @@ sources:
}
}

// testGenerateIdentifierFallthrough runs GenerateIdentifier over every JSON
// fixture listed in tests and compares the outcome with the expectations
// recorded on each entry (unique id, matched path, identifier type and whether
// an error is expected).
//
// The source's identifier type is always forced to configTypes.IdentifierSha;
// only the identifier path is taken from the test entry. Fixtures are read from
// testdata/identifier relative to this package.
func testGenerateIdentifierFallthrough(tests []jsonexpectations, t *testing.T) {

	// Mock configuration file.
	// Paths are relative to the code.
	// Nested keys must stay indented: YAML derives structure from indentation.
	var vipercontext = []byte(`
context:
  cache: true
contextmaps:
- file: ../../configs/schemaorg-current-https.jsonld
  prefix: https://schema.org/
- file: ../../configs/schemaorg-current-https.jsonld
  prefix: http://schema.org/
sources:
- sourcetype: sitemap
  name: test
  logo: https://opentopography.org/sites/opentopography.org/files/ot_transp_logo_2.png
  url: https://opentopography.org/sitemap.xml
  headless: false
  pid: https://www.re3data.org/repository/r3d100010655
  propername: OpenTopography
  domain: http://www.opentopography.org/
  active: false
  credentialsfile: ""
  other: {}
  headlesswait: 0
  delay: 0
  IdentifierType: identifiersha
`)

	for _, test := range tests {
		for i, file := range test.json {
			// Needs to be defined in the loop, so that each run has its own configuration.
			// Otherwise changing the sources information in a multi-threaded environment has issues.
			viperVal := viper.New()
			viperVal.SetConfigType("yaml")
			// A config that fails to parse would otherwise only surface later as
			// a confusing "no sources" failure, so stop here with the real cause.
			if err := viperVal.ReadConfig(bytes.NewBuffer(vipercontext)); err != nil {
				t.Fatalf("error reading mock configuration: %v", err)
			}

			sources, err := configTypes.GetSources(viperVal)
			if err != nil {
				t.Fatalf("error getting sources from mock configuration: %v", err)
			}
			// Guard the index below: an empty result must fail the test, not panic.
			if len(sources) == 0 {
				t.Fatal("mock configuration produced no sources")
			}

			s := sources[0]
			s.IdentifierType = configTypes.IdentifierSha
			s.IdentifierPath = test.IdentifierPaths
			t.Run(fmt.Sprint(test.name, "_", i), func(t *testing.T) {
				if test.ignore {
					return
				}
				path := filepath.Join("testdata", "identifier", file)
				assert.FileExistsf(t, path, "Datafile Missing: %s", path)
				source, err := os.ReadFile(path)
				if err != nil {
					t.Fatal("error reading source file:", err)
				}
				result, err := GenerateIdentifier(viperVal, s, string(source))
				assert.Equal(t, test.expected, result.UniqueId, "uuid faild")
				assert.Equal(t, test.expectedPath, result.MatchedPath, "matched path failed")
				assert.Equal(t, test.IdentifierType, result.IdentifierType, "identifier failed")
				if test.errorExpected {
					assert.NotNil(t, err)
				} else {
					assert.Nil(t, err)
				}
			})
		}
	}
}
func TestGenerateFileShaIdentifier(t *testing.T) {

var tests = []jsonexpectations{
Expand Down Expand Up @@ -628,3 +699,38 @@ func TestValidJsonPathGraphInput(t *testing.T) {

testValidJsonPaths(tests, t)
}

// TestGenerateBadIdentifier runs the bad_json.jsonld fixture through
// testGenerateIdentifierFallthrough. The single active case expects an error
// to be returned together with an identifier of type configTypes.FileSha.
func TestGenerateBadIdentifier(t *testing.T) {
	// default
	// should work for all
	badJSONCase := jsonexpectations{
		name:            "extra_slash_jsonsha",
		json:            map[string]string{"bad_json": "bad_json.jsonld"},
		IdentifierPaths: "",
		IdentifierType:  configTypes.FileSha,
		expected:        "cc8c395af8163203dda2c6bee35fd728071bd809",
		expectedPath:    "",
		errorExpected:   true,
		ignore:          false,
	}

	// Disabled variant of the same fixture that does not expect an error:
	//{
	//	name: "extra_slash_identifier",
	//	json: map[string]string{
	//		"bad_json": "bad_json.jsonld",
	//	},
	//	errorExpected:   false,
	//	IdentifierType:  configTypes.FileSha,
	//	IdentifierPaths: "",
	//	expected:        "cc8c395af8163203dda2c6bee35fd728071bd809",
	//	expectedPath:    "",
	//	ignore:          false,
	//},

	testGenerateIdentifierFallthrough([]jsonexpectations{badJSONCase}, t)
}
1 change: 1 addition & 0 deletions internal/common/logger.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ func InitLogging() {

log.SetFormatter(&log.JSONFormatter{}) // Log as JSON instead of the default ASCII formatter.
log.SetReportCaller(true) // include file name and line number
log.SetLevel(log.InfoLevel)
mw := io.MultiWriter(os.Stdout, logFile)
log.SetOutput(mw)
//log.SetOutput(logFile)
Expand Down
4 changes: 4 additions & 0 deletions internal/common/stats.go
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,10 @@ const EmptyDoc string = "SitemapEmptyDoc"
// String constants naming per-URL outcome categories.
// NOTE(review): presumably these are the keys passed to RepoStats.Inc — confirm against callers.
const (
	// Stored has the value "SitemapStored".
	Stored string = "SitemapStored"
	// StoreError has the value "SitemapStoredError".
	StoreError string = "SitemapStoredError"
	// HeadlessError has the value "HeadlessServerError".
	HeadlessError string = "HeadlessServerError"
	// NotAuthorized has the value "NotAuthorized".
	NotAuthorized string = "NotAuthorized"
	// BadUrl has the value "BadURL404".
	BadUrl string = "BadURL404"
	// RepoServerError has the value "RepoServerError".
	RepoServerError string = "RepoServerError"
	// GenericIssue has the value "GenericUrlIssue". It is explicitly typed as
	// string so that it matches the other constants in this group.
	GenericIssue string = "GenericUrlIssue"
)

// Inc increments the counter for the given key.
func (c *RepoStats) Inc(key string) {
Expand Down
Loading

0 comments on commit eeb7726

Please sign in to comment.