diff --git a/.github/workflows/contanerize.yaml b/.github/workflows/contanerize.yaml
index ef3ba1b7..b4bad35f 100644
--- a/.github/workflows/contanerize.yaml
+++ b/.github/workflows/contanerize.yaml
@@ -9,8 +9,11 @@ on:
branches:
- main
- dev
+ - dv_dev
- dev_eco
- # - dev_dagster142
+ - v0_generated_code
+ - 133_dev_sitemaps
+ - 151-integrate-community-stats-codes
tags:
- "v*.*.*"
@@ -154,7 +157,44 @@ jobs:
type=ref,event=branch
type=semver,pattern={{version}}
type=sha
-
+ build_code_workflows:
+ name: Dockerize Scheduler Workflows base
+ runs-on: ubuntu-latest
+ #strategy:
+ #matrix:
+ # project: [ "eco" ]
+ #project: [ "eco", "iow", "oih" ]
+ #platform: ["linux/amd64","linux/arm64"]
+ #platform: ["linux/amd64"] #linux/arm64 issues with building
+ steps:
+ - name: Set variables
+ run: |
+ REGISTRY_IMAGE=nsfearthcube/dagster-gleanerio-workflows
+ echo "REGISTRY_IMAGE=$REGISTRY_IMAGE" >> $GITHUB_ENV
+ working-directory: /
+ - name: Checkout Repo
+ uses: actions/checkout@v3
+ - name: Set up QEMU
+ uses: docker/setup-qemu-action@v2
+ - name: Set up Docker Buildx
+ uses: docker/setup-buildx-action@v2
+ - name: Login to DockerHub
+ uses: docker/login-action@v2
+ with:
+ username: ${{ secrets.DOCKERHUB_USERNAME }}
+ password: ${{ secrets.DOCKERHUB_TOKEN }}
+ - name: Extract metadata (tags, labels) for Docker
+ id: meta
+ uses: docker/metadata-action@v4
+ with:
+ images: ${{ env.REGISTRY_IMAGE }}
+ flavor: |
+ latest=true
+ tags: |
+ type=ref,event=tag
+ type=ref,event=branch
+ type=semver,pattern={{version}}
+ type=sha
# - name: Set up Python 3.10
# uses: actions/setup-python@v4
# with:
@@ -201,7 +241,7 @@ jobs:
build-args:
implnet=${{ matrix.project }}
#file: ./dagster/implnets/build/Dockerfile
- file: ./build/Dockerfile_code
+ file: ./build/Dockerfile_workflows
context: "{{defaultContext}}:dagster/implnets"
tags: ${{ steps.meta.outputs.tags }}
# tags: nsfearthcube/ec_facets_client:latest
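
A rough sketch of how the docker/metadata-action settings above (flavor `latest=true`; `type=ref`, `type=semver`, `type=sha`) map a git ref onto image tags; the short-sha format and helper are assumptions for illustration, not the action's exact output:

```python
# Illustrative only: approximate the tags docker/metadata-action would emit
# for the config above. Not part of the workflow itself.
def candidate_tags(image: str, ref_type: str, ref_name: str, sha: str) -> list[str]:
    tags = []
    if ref_type == "branch":                      # type=ref,event=branch
        tags.append(f"{image}:{ref_name.replace('/', '-')}")
    if ref_type == "tag":                         # type=ref,event=tag
        tags.append(f"{image}:{ref_name}")
        if ref_name.startswith("v"):              # type=semver,pattern={{version}}
            tags.append(f"{image}:{ref_name[1:]}")
    tags.append(f"{image}:sha-{sha[:7]}")         # type=sha
    tags.append(f"{image}:latest")                # flavor: latest=true
    return tags

print(candidate_tags("nsfearthcube/dagster-gleanerio-workflows", "tag", "v0.1.0", "0123456789abcdef"))
```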
diff --git a/.gitignore b/.gitignore
index 75daa638..ae7c143f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -8,3 +8,10 @@ venv/**
/dagster/.telemetry/
/dagster/.telemetry/
.env
+
+/dagster/implnets/generatedCode/implnet-*/output/
+
+/dagster/implnets/deployment/prod.env
+
+**/tmp**
+/dagster/dagster_home/
diff --git a/NOTES.md b/NOTES.md
index d3ce74cd..3864eda4 100644
--- a/NOTES.md
+++ b/NOTES.md
@@ -1,5 +1,8 @@
# Notes
+Need to do dynamic ops/assets; see this article on sensors and partitions:
+https://medium.com/@thegreat.rashid83/dagster-sensors-partition-c7a5205d4c0d
+
## Development
At the top level (dagster/implents) you can run
@@ -28,3 +31,11 @@ will run just the task, and in editable form, i think.
## Some articles to review
[Medium on Dagster with configurable API and asset examples](https://medium.com/@alexandreguitton_12701/notes-1-2-dagster-data-orchestrator-hands-on-2af6772b13d9)
+
+## Troubleshooting
+Keep the Python versions in the Docker definitions in sync; gRPC can be finicky.
+
+e.g.:
+
+`FROM python:3.11-slim`
+
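
A minimal sketch of the "dynamic ops/assets" idea noted above, using Dagster dynamic partitions plus a sensor, in the spirit of the linked article. The asset, job, and source names are illustrative, not the scheduler's real code:

```python
# Sketch: a dynamically partitioned asset whose partitions (sources) are
# discovered by a sensor at runtime, assuming a recent Dagster 1.x API.
from dagster import (
    AssetSelection, Definitions, DynamicPartitionsDefinition,
    RunRequest, SensorResult, asset, define_asset_job, sensor,
)

sources_partitions = DynamicPartitionsDefinition(name="sources")

@asset(partitions_def=sources_partitions)
def harvested_source(context) -> None:
    context.log.info(f"harvest {context.partition_key}")

harvest_job = define_asset_job("harvest_job", AssetSelection.assets(harvested_source))

@sensor(job=harvest_job)
def source_sensor(context):
    # In the real scheduler this would read gleanerconfig.yaml; hardcoded here.
    discovered = ["iris", "geocodes_demo_datasets"]
    new = [s for s in discovered if not context.instance.has_dynamic_partition("sources", s)]
    return SensorResult(
        run_requests=[RunRequest(partition_key=s) for s in new],
        dynamic_partitions_requests=[sources_partitions.build_add_request(new)],
    )

defs = Definitions(assets=[harvested_source], jobs=[harvest_job], sensors=[source_sensor])
```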
diff --git a/README.md b/README.md
index 93bf8217..ba39b135 100644
--- a/README.md
+++ b/README.md
@@ -10,5 +10,8 @@ structured data on the web.
Details of the approach can be found in the [github io](https://earthcube.github.io/scheduler/).
+NOTE: Generated-code branch: v0_generated_code
+This is the original code that utilized a code-generation approach to build the workflows.
+v0_generated_code is where gleaner and nabu config file updates should be done when using the original code.
diff --git a/dagster/dagster_home/.gitkeep b/dagster/dagster_home/.gitkeep
new file mode 100644
index 00000000..79083c9a
--- /dev/null
+++ b/dagster/dagster_home/.gitkeep
@@ -0,0 +1 @@
+This is a place where dagster.yaml files can be kept for runs.
diff --git a/dagster/dagster_home/dagster.yaml b/dagster/dagster_home/dagster.yaml
new file mode 100644
index 00000000..35033656
--- /dev/null
+++ b/dagster/dagster_home/dagster.yaml
@@ -0,0 +1,23 @@
+local_artifact_storage:
+ module: dagster.core.storage.root
+ class: LocalArtifactStorage
+ config:
+ base_dir: /Users/valentin/development/dev_earthcube/scheduler/dagster/dagster_home/
+run_coordinator:
+ module: dagster.core.run_coordinator
+ class: QueuedRunCoordinator
+ config:
+ max_concurrent_runs: 6
+ # getting tags by copying from UI
+ tag_concurrency_limits:
+ - key: "ingest"
+ value: "docker"
+ limit: 3
+ - key: "ingest"
+ value: "report"
+ limit: 2
+ - key: "tenant_load"
+ value: "graph"
+ limit: 1
+telemetry:
+ enabled: false
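
For context on the tag_concurrency_limits above: a run only counts against a limit when it carries the matching tag, otherwise only max_concurrent_runs applies. A minimal sketch (the job and op are illustrative):

```python
# A job tagged {"ingest": "docker"} counts against the "ingest"/"docker"
# limit of 3 in the dagster.yaml above.
from dagster import job, op

@op
def load_one_source():
    return "ok"

@job(tags={"ingest": "docker"})
def ingest_job():
    load_one_source()
```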
diff --git a/dagster/implnets/Makefile b/dagster/implnets/Makefile
index 9f4778bb..a3f7aa4b 100644
--- a/dagster/implnets/Makefile
+++ b/dagster/implnets/Makefile
@@ -2,6 +2,14 @@
.SHELLFLAGS += -e
VERSION :=`cat VERSION`
+# ---- workflows ----
+# no code generation is needed for workflows
+
+wf-build:
+ podman build --tag="docker.io/fils/dagster_wf:$(VERSION)" --build-arg implnet=eco --file=./build/Dockerfile_workflows .
+
+wf-push:
+ podman push docker.io/fils/dagster_wf:$(VERSION)
# ---- ECO ----
diff --git a/dagster/implnets/VERSION b/dagster/implnets/VERSION
index 9c3f756d..6e8bf73a 100644
--- a/dagster/implnets/VERSION
+++ b/dagster/implnets/VERSION
@@ -1 +1 @@
-0.0.67
+0.1.0
diff --git a/dagster/implnets/__init__.py b/dagster/implnets/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/dagster/implnets/build/Dockerfile_code b/dagster/implnets/build/Dockerfile_code
index 1c6cf0a3..8b82293d 100644
--- a/dagster/implnets/build/Dockerfile_code
+++ b/dagster/implnets/build/Dockerfile_code
@@ -12,10 +12,11 @@ RUN mkdir -p /usr/src/app/workflows
RUN pip install --upgrade pip
## this is a base for the project. Build this 'layer' first
-COPY ./requirements_code.txt requirements.txt
+COPY ./requirements.txt requirements.txt
RUN pip install -r requirements.txt
# this add the code
+# this is only needed because we generate the code with pygen; otherwise it is added in the compose-project.yaml docker compose file
COPY . scheduler
COPY ./configs/${implnet}/gleanerconfig.yaml scheduler/gleanerconfig.yaml
diff --git a/dagster/implnets/build/Dockerfile_local b/dagster/implnets/build/Dockerfile_local
index acd36f34..98f40589 100644
--- a/dagster/implnets/build/Dockerfile_local
+++ b/dagster/implnets/build/Dockerfile_local
@@ -25,4 +25,4 @@ WORKDIR /usr/src/app
ENV DAGSTER_HOME=/usr/src/app
-CMD ["dagster-webserver", "-w", "./project/${implnet}/workspace.yaml", "-h", "0.0.0.0", "-p", "3000"]
+CMD [ "dagster", "api","grpc", "-h", "0.0.0.0", "-p", "4000", "-m", "workflows.tasks.tasks", "-d", "/usr/src/app/"]
diff --git a/dagster/implnets/build/Dockerfile_workflows b/dagster/implnets/build/Dockerfile_workflows
new file mode 100644
index 00000000..24a70764
--- /dev/null
+++ b/dagster/implnets/build/Dockerfile_workflows
@@ -0,0 +1,43 @@
+FROM python:3.11-slim
+
+
+# This file no longer needs to generate code; it just includes the base.
+# It runs ingest by default.
+# We may want to get an unreleased version of code, so git is needed.
+
+RUN apt-get update && apt-get install -y git
+RUN pip install --upgrade pip
+RUN apt-get install -y gcc musl-dev python3-dev
+#RUN apt-get install libffi-dev
+# Read the ARG implnet to set which project to build for.
+
+# docker build-and-push pulls the repo, so we need to put the code at a different location
+# this fails because the dagster/implnets files are not in the Docker context
+ARG implnet=eco
+
+RUN mkdir -p /usr/src/app/workflows
+
+
+## this is a base for the project. Build this 'layer' first
+COPY ./requirements.txt requirements.txt
+RUN pip install -r requirements.txt
+
+# this adds the code
+COPY . scheduler
+#COPY ./configs/${implnet}/gleanerconfig.yaml scheduler/gleanerconfig.yaml
+
+COPY ./deployment/dagster.yaml /usr/src/app/
+
+WORKDIR scheduler
+
+
+COPY ./workflows/ /usr/src/app/workflows
+
+
+
+# Change working directory
+WORKDIR /usr/src/app
+ENV DAGSTER_HOME=/usr/src/app
+
+
+CMD [ "dagster", "api","grpc", "-h", "0.0.0.0", "-p", "4000", "-m", "workflows.tasks.tasks", "-d", "/usr/src/app/"]
diff --git a/dagster/implnets/configs/a_test/headless/.gitkeep b/dagster/implnets/configs/a_test/headless/.gitkeep
new file mode 100644
index 00000000..e69de29b
diff --git a/dagster/implnets/configs/eco/gleanerconfig.yaml b/dagster/implnets/configs/eco/gleanerconfig.yaml
index 6ec78244..570da029 100644
--- a/dagster/implnets/configs/eco/gleanerconfig.yaml
+++ b/dagster/implnets/configs/eco/gleanerconfig.yaml
@@ -1,6 +1,6 @@
context:
cache: true
- strict: true
+ strict: false
contextmaps:
- prefix: "https://schema.org/"
file: "/assets/schemaorg-current-https.jsonld" # wget http://schema.org/docs/jsonldcontext.jsonld
@@ -44,9 +44,610 @@ sources:
identifierpath: ""
apipagelimit: 0
identifiertype: identifiersha
+ fixcontextoption: 1
+ acceptcontenttype: application/ld+json, text/html
+ jsonprofile: application/ld+json
+ - sourcetype: sitegraph
+ name: aquadocs
+ logo: ""
+ url: https://oih.aquadocs.org/aquadocs.json
+ headless: false
+ pid: http://hdl.handle.net/1834/41372
+ propername: AquaDocs
+ domain: https://aquadocs.org
+ active: false
+ credentialsfile: ""
+ other: { }
+ headlesswait: 0
+ delay: 0
+ identifierpath: ""
+ apipagelimit: 0
+ identifiertype: identifiersha
+ fixcontextoption: 0
+ acceptcontenttype: application/ld+json, text/html
+ jsonprofile: application/ld+json
+ - sourcetype: sitemap
+ name: bcodmo
+ logo: https://www.bco-dmo.org/sites/all/themes/bcodmo/logo.png
+ url: https://www.bco-dmo.org/sitemap.xml
+ headless: false
+ pid: https://www.re3data.org/repository/r3d100000012
+ propername: Biological and Chemical Oceanography Data Management Office
+ domain: http://www.bco-dmo.org/
+ active: false
+ credentialsfile: ""
+ other: { }
+ headlesswait: 0
+ delay: 0
+ identifierpath: ""
+ apipagelimit: 0
+ identifiertype: identifiersha
+ fixcontextoption: 0
+ acceptcontenttype: application/ld+json, text/html
+ jsonprofile: application/ld+json
+ - sourcetype: sitemap
+ name: cchdo
+ logo: https://cchdo.ucsd.edu/static/svg/logo_cchdo.svg
+ url: https://cchdo.ucsd.edu/sitemap.xml
+ headless: false
+ pid: https://www.re3data.org/repository/r3d100010831
+ propername: CLIVAR and Carbon Hydrographic Data Office
+ domain: https://cchdo.ucsd.edu/
+ active: false
+ credentialsfile: ""
+ other: { }
+ headlesswait: 0
+ delay: 0
+ identifierpath: ""
+ apipagelimit: 0
+ identifiertype: identifiersha
+ fixcontextoption: 0
+ acceptcontenttype: application/ld+json, text/html
+ jsonprofile: application/ld+json
+ - sourcetype: sitemap
+ name: datadiscoverystudio
+ logo: http://datadiscoverystudio.org/geoportal/images/DataDiscoveryStudioBufferedWhite.png
+ url: http://datadiscoverystudio.org/sitemap/CinergiSiteIndex.xml
+ headless: false
+ pid: ""
+ propername: ""
+ domain: http://datadiscoverystudio.org/geoportal
+ active: false
+ credentialsfile: ""
+ other: { }
+ headlesswait: 0
+ delay: 0
+ identifierpath: ""
+ apipagelimit: 0
+ identifiertype: identifiersha
+ fixcontextoption: 0
+ acceptcontenttype: application/ld+json, text/html
+ jsonprofile: application/ld+json
+ - sourcetype: sitemap
+ name: designsafe
+ logo: ""
+ url: https://www.designsafe-ci.org/sitemap.xml
+ headless: false
+ pid: ""
+ propername: ""
+ domain: https://www.designsafe-ci.org/data/browser/public/
+ active: false
+ credentialsfile: ""
+ other: { }
+ headlesswait: 0
+ delay: 0
+ identifierpath: ""
+ apipagelimit: 0
+ identifiertype: identifiersha
+ fixcontextoption: 0
+ acceptcontenttype: application/ld+json, text/html
+ jsonprofile: application/ld+json
+ - sourcetype: sitemap
+ name: earthchem
+ logo: http://www.earthchem.org/sites/default/files/files/EC_0-1.png
+ url: https://ecl.earthchem.org/sitemap.xml
+ headless: false
+ pid: https://www.re3data.org/repository/r3d100011538
+ propername: earthchem
+ domain: https://ecl.earthchem.org/home.php
+ active: false
+ credentialsfile: ""
+ other: { }
+ headlesswait: 0
+ delay: 0
+ identifierpath: $.sameAs
+ apipagelimit: 0
+ identifiertype: identifiersha
+ fixcontextoption: 0
+ acceptcontenttype: application/ld+json, text/html
+ jsonprofile: application/ld+json
+ - sourcetype: sitemap
+ name: ecrr_examples
+ logo: https://www.earthcube.org/sites/default/files/doc-repository/logo_earthcube_full_horizontal.png
+ url: https://earthcube.github.io/ecrro/Examples/sitemap.xml
+ headless: false
+ pid: http://www.earthcube.org/resourceregistry/examples
+ propername: Earthcube Resource Registry Examples
+ domain: http://www.earthcube.org/resourceregistry/examples
+ active: false
+ credentialsfile: ""
+ other: { }
+ headlesswait: 0
+ delay: 0
+ identifierpath: ""
+ apipagelimit: 0
+ identifiertype: identifierstring
+ fixcontextoption: 0
+ acceptcontenttype: application/ld+json, text/html
+ jsonprofile: application/ld+json
+ - sourcetype: sitemap
+ name: edi
+ logo: https://portal.edirepository.org/nis/images/EDI-logo-300DPI_5.png
+ url: https://portal.edirepository.org/sitemap_index.xml
+ headless: false
+ pid: https://www.re3data.org/repository/r3d100010272
+ propername: Environmental Data Initiative
+ domain: 'http://environmentaldatainitiative.org/ '
+ active: false
+ credentialsfile: ""
+ other: { }
+ headlesswait: 0
+ delay: 0
+ identifierpath: ""
+ apipagelimit: 0
+ identifiertype: identifiersha
+ fixcontextoption: 0
+ acceptcontenttype: application/ld+json, text/html
+ jsonprofile: application/ld+json
+ - sourcetype: sitemap
+ name: geocodes_demo_datasets
+ logo: ""
+ url: https://earthcube.github.io/GeoCODES-Metadata/metadata/Dataset/allgood/sitemap.xml
+ headless: false
+ pid: https://github.com/earthcube/GeoCODES-Metadata/metadata/OtherResources
+ propername: Geocodes Demo Datasets
+ domain: https://www.earthcube.org/datasets/
+ active: false
+ credentialsfile: ""
+ other: { }
+ headlesswait: 0
+ delay: 0
+ identifierpath: ""
+ apipagelimit: 0
+ identifiertype: identifiersha
+ fixcontextoption: 0
+ acceptcontenttype: application/ld+json, text/html
+ jsonprofile: application/ld+json
+ - sourcetype: sitemap
+ name: geocodes_examples
+ logo: ""
+ url: https://earthcube.github.io/GeoCODES-Metadata/metadata/Dataset/allgood/sitemap.xml
+ headless: false
+ pid: https://github.com/earthcube/GeoCODES-Metadata/
+ propername: GeoCodes Tools Examples
+ domain: https://github.com/earthcube/GeoCODES-Metadata/
+ active: true
+ credentialsfile: ""
+ other: { }
+ headlesswait: 0
+ delay: 0
+ identifierpath: ""
+ apipagelimit: 0
+ identifiertype: identifierstring
+ fixcontextoption: 0
+ acceptcontenttype: application/ld+json, text/html
+ jsonprofile: application/ld+json
+ - sourcetype: sitemap
+ name: hydroshare
+ logo: https://www.hydroshare.org/static/img/logo-lg.png
+ url: https://www.hydroshare.org/sitemap-resources.xml
+ headless: false
+ pid: https://www.re3data.org/repository/r3d100012625
+ propername: Consortium of Universities for the Advancement of Hydrologic Science, Inc. (CUAHSI)
+ domain: https://www.cuahsi.org/
+ active: false
+ credentialsfile: ""
+ other: { }
+ headlesswait: -1
+ delay: 0
+ identifierpath: ""
+ apipagelimit: 0
+ identifiertype: identifiersha
+ fixcontextoption: 0
+ acceptcontenttype: application/ld+json, text/html
+ jsonprofile: application/ld+json
+ - sourcetype: sitemap
+ name: iedadata
+ logo: https://www.iedadata.org/wp-content/themes/IEDA/assets/img/logo.png
+ url: http://get.iedadata.org/doi/xml-sitemap.php
+ headless: false
+ pid: https://www.re3data.org/repository/r3d100010578
+ propername: IEDA (Integrated Earth Data Applications)
+ domain: http://www.iedadata.org/
+ active: false
+ credentialsfile: ""
+ other: { }
+ headlesswait: 0
+ delay: 0
+ identifierpath: $.sameAs
+ apipagelimit: 0
+ identifiertype: identifiersha
+ fixcontextoption: 0
+ acceptcontenttype: application/ld+json, text/html
+ jsonprofile: application/ld+json
+ - sourcetype: sitemap
+ name: iris
+ logo: http://ds.iris.edu/static/img/layout/logos/iris_logo_shadow.png
+ url: http://ds.iris.edu/files/sitemap.xml
+ headless: false
+ pid: https://www.re3data.org/repository/r3d100010268
+ propername: IRIS
+ domain: http://iris.edu
+ active: true
+ credentialsfile: ""
+ other: { }
+ headlesswait: 0
+ delay: 0
+ identifierpath: ""
+ apipagelimit: 0
+ identifiertype: identifiersha
+ fixcontextoption: 0
+ acceptcontenttype: application/ld+json, text/html
+ jsonprofile: application/ld+json
+ - sourcetype: sitemap
+ name: linkedearth
+ logo: http://wiki.linked.earth/wiki/images/thumb/5/51/EarthLinked_Banner_blue_NoShadow.jpg/440px-EarthLinked_Banner_blue_NoShadow.jpg
+ url: http://wiki.linked.earth/sitemap.xml
+ headless: false
+ pid: http://wiki.linked.earth
+ propername: Linked Earth
+ domain: http://wiki.linked.earth
+ active: false
+ credentialsfile: ""
+ other: { }
+ headlesswait: -1
+ delay: 0
+ identifierpath: ""
+ apipagelimit: 0
+ identifiertype: identifiersha
+ fixcontextoption: 0
+ acceptcontenttype: application/ld+json, text/html
+ jsonprofile: application/ld+json
+ - sourcetype: sitemap
+ name: lipdverse
+ logo: ""
+ url: https://lipdverse.org/sitemap.xml
+ headless: false
+ pid: https://lipdverse.org
+ propername: Linked PaleoData
+ domain: https://lipdverse.org/
+ active: false
+ credentialsfile: ""
+ other: { }
+ headlesswait: 0
+ delay: 0
+ identifierpath: ""
+ apipagelimit: 0
+ identifiertype: identifiersha
+ fixcontextoption: 0
+ acceptcontenttype: application/ld+json, text/html
+ jsonprofile: application/ld+json
+ - sourcetype: sitemap
+ name: magic
+ logo: http://mbobak.ncsa.illinois.edu/ext/ec/magic/MagIC.png
+ url: https://www2.earthref.org/MagIC/contributions.sitemap.xml
+ headless: true
+ pid: http://www.re3data.org/repository/r3d100011910
+ propername: Magnetics Information Consortium (MagIC)
+ domain: https://www.earthref.org/MagIC
+ active: false
+ credentialsfile: ""
+ other: { }
+ headlesswait: 0
+ delay: 0
+ identifierpath: $.sameAs
+ apipagelimit: 0
+ identifiertype: identifiersha
+ fixcontextoption: 0
+ acceptcontenttype: application/ld+json, text/html
+ jsonprofile: application/ld+json
+ - sourcetype: sitemap
+ name: neon
+ logo: https://www.neonscience.org/themes/custom/neon/logo.svg
+ url: https://geodex.org/neon_prodcodes_sm.xml
+ headless: false
+ pid: http://www.re3data.org/repository/r3d100010290
+ propername: National Ecological Observatory Network (NEON)
+ domain: http://www.neonscience.org/
+ active: false
+ credentialsfile: ""
+ other: { }
+ headlesswait: 0
+ delay: 0
+ identifierpath: ""
+ apipagelimit: 0
+ identifiertype: identifiersha
+ fixcontextoption: 0
+ acceptcontenttype: application/ld+json, text/html
+ jsonprofile: application/ld+json
+ - sourcetype: sitemap
+ name: neotomadb
+ logo: https://www.neotomadb.org/images/site_graphics/Packrat.png
+ url: http://data.neotomadb.org/sitemap.xml
+ headless: true
+ pid: http://www.re3data.org/repository/r3d100011761
+ propername: Neotoma
+ domain: http://www.neotomadb.org/
+ active: false
+ credentialsfile: ""
+ other: { }
+ headlesswait: 0
+ delay: 0
+ identifierpath: ""
+ apipagelimit: 0
+ identifiertype: identifiersha
+ fixcontextoption: 0
+ acceptcontenttype: application/ld+json, text/html
+ jsonprofile: application/ld+json
+ - sourcetype: sitemap
+ name: opencoredata
+ logo: https://opencoredata.org/img/logo22small.png
+ url: http://opencoredata.org/sitemap.xml
+ headless: false
+ pid: https://www.re3data.org/repository/r3d100012874
+ propername: opencoredata
+ domain: https://opencoredata.org/
+ active: false
+ credentialsfile: ""
+ other: { }
+ headlesswait: 0
+ delay: 0
+ identifierpath: ""
+ apipagelimit: 0
+ identifiertype: identifiersha
+ fixcontextoption: 0
+ acceptcontenttype: application/ld+json, text/html
+ jsonprofile: application/ld+json
+ - sourcetype: sitemap
+ name: opentopography
+ logo: https://opentopography.org/sites/opentopography.org/files/ot_transp_logo_2.png
+ url: https://portal.opentopography.org/sitemap.xml
+ headless: false
+ pid: https://www.re3data.org/repository/r3d100010655
+ propername: OpenTopography
+ domain: http://www.opentopography.org/
+ active: false
+ credentialsfile: ""
+ other: { }
+ headlesswait: 0
+ delay: 0
+ identifierpath: ""
+ apipagelimit: 0
+ identifiertype: identifiersha
+ fixcontextoption: 0
+ acceptcontenttype: application/ld+json, text/html
+ jsonprofile: application/ld+json
+ - sourcetype: sitemap
+ name: r2r
+ logo: https://www.rvdata.us/images/Logo.4b1519be.png
+ url: https://service-dev.rvdata.us/api/sitemap/
+ headless: true
+ pid: http://www.re3data.org/repository/r3d100010735
+ propername: Rolling Deck to Repository Program (R2R)
+ domain: https://www.rvdata.us/
+ active: false
+ credentialsfile: ""
+ other: { }
+ headlesswait: 5
+ delay: 0
+ identifierpath: ""
+ apipagelimit: 0
+ identifiertype: identifiersha
+ fixcontextoption: 0
+ acceptcontenttype: application/ld+json, text/html
+ jsonprofile: application/ld+json
+ - sourcetype: sitemap
+ name: resource_registry
+ logo: https://www.earthcube.org/sites/default/files/doc-repository/logo_earthcube_full_horizontal.png
+ url: https://object.cloud.sdsc.edu/v1/AUTH_85f46aa78936477d8e71b186269414e8/gleaner-summoned
+ headless: false
+ pid: ""
+ propername: Resource Registry
+ domain: http://www.earthcube.org/resourceregistry/
+ active: false
+ credentialsfile: ""
+ other: { }
+ headlesswait: 0
+ delay: 0
+ identifierpath: $.@id
+ apipagelimit: 0
+ identifiertype: identifierstring
+ fixcontextoption: 0
+ acceptcontenttype: application/ld+json, text/html
+ jsonprofile: application/ld+json
+ - sourcetype: sitemap
+ name: ssdbiodp
+ logo: http://ssdb.iodp.org/images/head_logo_PRO.gif
+ url: https://ssdb.iodp.org/dataset/sitemap.xml
+ headless: false
+ pid: https://www.re3data.org/repository/r3d100010267
+ propername: IODP Site Survey Databank
+ domain: https://ssdb.iodp.org/
+ active: false
+ credentialsfile: ""
+ other: { }
+ headlesswait: 0
+ delay: 0
+ identifierpath: ""
+ apipagelimit: 0
+ identifiertype: identifiersha
+ fixcontextoption: 0
+ acceptcontenttype: application/ld+json, text/html
+ jsonprofile: application/ld+json
+ - sourcetype: sitemap
+ name: ucar
+ logo: https://opensky.ucar.edu/islandora/object/opensky%3Aucommunity/datastream/TN/view
+ url: https://data.ucar.edu/sitemap.xml
+ headless: false
+ pid: https://www.re3data.org/repository/r3d100010791
+ propername: UCAR
+ domain: https://data.ucar.edu
+ active: false
+ credentialsfile: ""
+ other: { }
+ headlesswait: 0
+ delay: 0
+ identifierpath: ""
+ apipagelimit: 0
+ identifiertype: identifiersha
+ fixcontextoption: 0
+ acceptcontenttype: application/ld+json, text/html
+ jsonprofile: application/ld+json
+ - sourcetype: sitemap
+ name: unavco
+ logo: https://www.unavco.org/lib/images/banner/uv-logo.png
+ url: https://www.unavco.org/data/doi/sitemap.xml
+ headless: false
+ pid: http://www.re3data.org/repository/r3d100010872
+ propername: UNAVCO
+ domain: http://www.unavco.org/
+ active: false
+ credentialsfile: ""
+ other: { }
+ headlesswait: 0
+ delay: 0
+ identifierpath: ""
+ apipagelimit: 0
+ identifiertype: identifiersha
+ fixcontextoption: 0
+ acceptcontenttype: application/ld+json, text/html
+ jsonprofile: application/ld+json
+ - sourcetype: sitemap
+ name: unidata
+ logo: ""
+ url: https://www.unidata.ucar.edu/sitemap.xml
+ headless: false
+ pid: https://www.re3data.org/repository/r3d100010355
+ propername: UNIDATA
+ domain: http://www.unidata.ucar.edu/
+ active: false
+ credentialsfile: ""
+ other: { }
+ headlesswait: 0
+ delay: 0
+ identifierpath: ""
+ apipagelimit: 0
+ identifiertype: identifiersha
+ fixcontextoption: 0
+ acceptcontenttype: application/ld+json, text/html
+ jsonprofile: application/ld+json
+ - sourcetype: sitemap
+ name: usapdc
+ logo: https://www.usap-dc.org/
+ url: https://www.usap-dc.org/view/dataset/sitemap.xml
+ headless: true
+ pid: https://www.re3data.org/repository/r3d100010660
+ propername: U.S. Antarctic Program Data Center
+ domain: https://www.usap-dc.org/
+ active: false
+ credentialsfile: ""
+ other: { }
+ headlesswait: 0
+ delay: 0
+ identifierpath: ""
+ apipagelimit: 0
+ identifiertype: identifiersha
+ fixcontextoption: 0
+ acceptcontenttype: application/ld+json, text/html
+ jsonprofile: application/ld+json
+ - sourcetype: sitemap
+ name: wifire
+ logo: https://wifire-data.sdsc.edu/uploads/admin/2021-04-22-203649.712143WIFIRECOMMONSSMRES12.png
+ url: https://wifire-data.sdsc.edu/sitemap.xml
+ headless: false
+ pid: https://wifire-data.sdsc.edu/
+ propername: WIFIRE Commons
+ domain: https://wifire-data.sdsc.edu/
+ active: false
+ credentialsfile: ""
+ other: { }
+ headlesswait: 0
+ delay: 0
+ identifierpath: ""
+ apipagelimit: 0
+ identifiertype: identifiersha
+ fixcontextoption: 0
+ acceptcontenttype: application/ld+json, text/html
+ jsonprofile: application/ld+json
+ - sourcetype: sitemap
+ name: hydrography90m
+ logo: ""
+ url: https://raw.githubusercontent.com/earthcube/communityCollections/master/collection/hydrography90m/sitemaps/hydrography90m.xml
+ headless: false
+ pid: https://hydrography.org/hydrography90m/hydrography90m_layers
+ propername: Hydrography90m
+ domain: https://hydrography.org/hydrography90m/hydrography90m_layers
+ active: true
+ credentialsfile: ""
+ other: {}
+ headlesswait: 0
+ delay: 0
+ identifierpath: ""
+ apipagelimit: 0
+ identifiertype: identifiersha
fixcontextoption: 0
acceptcontenttype: application/ld+json, text/html
jsonprofile: application/ld+json
- sourcetype: sitegraph
name: aquadocs
logo: ""
@@ -131,7 +732,7 @@ sources:
pid: ""
propername: ""
domain: https://www.designsafe-ci.org/data/browser/public/
- active: true
+ active: false
credentialsfile: ""
other: { }
headlesswait: 0
@@ -150,7 +751,7 @@ sources:
pid: https://www.re3data.org/repository/r3d100011538
propername: earthchem
domain: https://ecl.earthchem.org/home.php
- active: true
+ active: false
credentialsfile: ""
other: { }
headlesswait: 0
@@ -164,12 +765,12 @@ sources:
- sourcetype: sitemap
name: ecrr_examples
logo: https://www.earthcube.org/sites/default/files/doc-repository/logo_earthcube_full_horizontal.png
- url: https://raw.githubusercontent.com/earthcube/ecrro/master/Examples/sitemap.xml
+ url: https://earthcube.github.io/ecrro/Examples/sitemap.xml
headless: false
pid: http://www.earthcube.org/resourceregistry/examples
propername: Earthcube Resource Registry Examples
domain: http://www.earthcube.org/resourceregistry/examples
- active: true
+ active: false
credentialsfile: ""
other: { }
headlesswait: 0
@@ -207,7 +808,7 @@ sources:
pid: https://github.com/earthcube/GeoCODES-Metadata/metadata/OtherResources
propername: Geocodes Demo Datasets
domain: https://www.earthcube.org/datasets/
- active: true
+ active: false
credentialsfile: ""
other: { }
headlesswait: 0
@@ -245,7 +846,7 @@ sources:
pid: https://www.re3data.org/repository/r3d100012625
propername: Consortium of Universities for the Advancement of Hydrologic Science, Inc. (CUAHSI)
domain: https://www.cuahsi.org/
- active: true
+ active: false
credentialsfile: ""
other: { }
headlesswait: -1
@@ -264,7 +865,7 @@ sources:
pid: https://www.re3data.org/repository/r3d100010578
propername: IEDA (Integrated Earth Data Applications)
domain: http://www.iedadata.org/
- active: true
+ active: false
credentialsfile: ""
other: { }
headlesswait: 0
@@ -302,7 +903,7 @@ sources:
pid: http://wiki.linked.earth
propername: Linked Earth
domain: http://wiki.linked.earth
- active: true
+ active: false
credentialsfile: ""
other: { }
headlesswait: -1
@@ -340,7 +941,7 @@ sources:
pid: http://www.re3data.org/repository/r3d100011910
propername: Magnetics Information Consortium (MagIC)
domain: https://www.earthref.org/MagIC
- active: true
+ active: false
credentialsfile: ""
other: { }
headlesswait: 0
@@ -378,7 +979,7 @@ sources:
pid: http://www.re3data.org/repository/r3d100011761
propername: Neotoma
domain: http://www.neotomadb.org/
- active: true
+ active: false
credentialsfile: ""
other: { }
headlesswait: 0
@@ -416,7 +1017,7 @@ sources:
pid: https://www.re3data.org/repository/r3d100010655
propername: OpenTopography
domain: http://www.opentopography.org/
- active: true
+ active: false
credentialsfile: ""
other: { }
headlesswait: 0
@@ -473,7 +1074,7 @@ sources:
pid: https://www.re3data.org/repository/r3d100010267
propername: IODP Site Survey Databank
domain: https://ssdb.iodp.org/
- active: true
+ active: false
credentialsfile: ""
other: { }
headlesswait: 0
@@ -511,7 +1112,7 @@ sources:
pid: http://www.re3data.org/repository/r3d100010872
propername: UNAVCO
domain: http://www.unavco.org/
- active: true
+ active: false
credentialsfile: ""
other: { }
headlesswait: 0
@@ -549,7 +1150,7 @@ sources:
pid: https://www.re3data.org/repository/r3d100010660
propername: U.S. Antarctic Program Data Center
domain: https://www.usap-dc.org/
- active: true
+ active: false
credentialsfile: ""
other: { }
headlesswait: 0
@@ -568,7 +1169,7 @@ sources:
pid: https://wifire-data.sdsc.edu/
propername: WIFIRE Commons
domain: https://wifire-data.sdsc.edu/
- active: true
+ active: false
credentialsfile: ""
other: { }
headlesswait: 0
@@ -579,4 +1180,191 @@ sources:
fixcontextoption: 0
acceptcontenttype: application/ld+json, text/html
jsonprofile: application/ld+json
-
+ - sourcetype: sitemap
+ name: hydrography90m
+ logo: ""
+ url: https://raw.githubusercontent.com/earthcube/communityCollections/master/collection/hydrography90m/sitemaps/hydrography90m.xml
+ headless: false
+ pid: https://hydrography.org/hydrography90m/hydrography90m_layers
+ propername: Hydrography90m
+ domain: https://hydrography.org/hydrography90m/hydrography90m_layers
+ active: true
+ credentialsfile: ""
+ other: {}
+ headlesswait: 0
+ delay: 0
+ identifierpath: ""
+ apipagelimit: 0
+ identifiertype: identifiersha
+ fixcontextoption: 0
+ acceptcontenttype: application/ld+json, text/html
+ jsonprofile: application/ld+json
+ - sourcetype: sitemap
+ name: neon4cast
+ logo: ""
+ url: https://raw.githubusercontent.com/earthcube/stacIndexer/yl_dv/data/output/sitemap/sitemap_neon4cast.xml
+ headless: false
+ pid: https://projects.ecoforecast.org/neon4cast-ci/
+ propername: NEON Ecological Forecast Challenge
+ domain: https://projects.ecoforecast.org/neon4cast-ci/
+ active: true
+ credentialsfile: ""
+ other: {}
+ headlesswait: 0
+ delay: 0
+ identifierpath: ""
+ apipagelimit: 0
+ identifiertype: identifiersha
+ fixcontextoption: 0
+ acceptcontenttype: application/ld+json, text/html
+ jsonprofile: application/ld+json
+ - sourcetype: sitemap
+ name: usgsrc4cast
+ logo: ""
+ url: https://raw.githubusercontent.com/earthcube/stacIndexer/yl_dv/data/output/sitemap/sitemap_usgsrc4cast.xml
+ headless: false
+ pid: https://github.com/eco4cast/usgsrc4cast-ci
+ propername: Ecological Forecasting Initiative (EFI) and U.S. Geological Survey (USGS) River Chlorophyll Forecasting Challenge
+ domain: https://github.com/eco4cast/usgsrc4cast-ci
+ active: true
+ credentialsfile: ""
+ other: {}
+ headlesswait: 0
+ delay: 0
+ identifierpath: ""
+ apipagelimit: 0
+ identifiertype: identifiersha
+ fixcontextoption: 0
+ acceptcontenttype: application/ld+json, text/html
+ jsonprofile: application/ld+json
+ - sourcetype: sitemap
+ name: vera4cast
+ logo: ""
+ url: https://raw.githubusercontent.com/earthcube/stacIndexer/yl_dv/data/output/sitemap/sitemap_vera4cast.xml
+ headless: false
+ pid: https://github.com/LTREB-reservoirs/vera4cast
+ propername: Virginia Ecoforecast Reservoir Analysis (VERA) Ecological Forecasting Challenge
+ domain: https://github.com/LTREB-reservoirs/vera4cast
+ active: true
+ credentialsfile: ""
+ other: {}
+ headlesswait: 0
+ delay: 0
+ identifierpath: ""
+ apipagelimit: 0
+ identifiertype: identifiersha
+ fixcontextoption: 0
+ acceptcontenttype: application/ld+json, text/html
+ jsonprofile: application/ld+json
+ - sourcetype: sitemap
+ name: osmc
+ logo: ""
+ url: https://osmc.noaa.gov/erddap/sitemap.xml
+ headless: false
+ pid: https://osmc.noaa.gov/erddap
+ propername: Easier access to scientific data
+ domain: https://osmc.noaa.gov/erddap
+ active: true
+ credentialsfile: ""
+ other: {}
+ headlesswait: 0
+ delay: 0
+ identifierpath: ""
+ apipagelimit: 0
+ identifiertype: identifiersha
+ fixcontextoption: 0
+ acceptcontenttype: application/ld+json, text/html
+ jsonprofile: application/ld+json
+ - sourcetype: sitemap
+ name: obis
+ logo: https://obis.org/images/logo.png
+ url: https://obis-sitemaps.s3.amazonaws.com/sitemap_datasets.xml
+ headless: false
+ pid: https://catalogue.odis.org/view/343
+ propername: Ocean Biodiversity Information System (OBIS)
+ domain: https://obis.org
+ active: true
+ credentialsfile: ""
+ other: {}
+ headlesswait: -1
+ delay: 0
+ identifierpath: ""
+ apipagelimit: 0
+ identifiertype: identifiersha
+ fixcontextoption: 0
+ acceptcontenttype: application/ld+json, text/html
+ jsonprofile: application/ld+json
+ - sourcetype: sitemap
+ name: geochemistry_custom
+ logo:
+ url: https://oss.geocodes.ncsa.illinois.edu/decoder/sitemaps/geochemistry_sitemap.xml
+ headless: false
+ pid: https://catalogue.odis.org/view/343
+ propername: Geochemistry Custom Datasets
+ domain: https://obis.org
+ active: true
+ credentialsfile: ""
+ other: {}
+ headlesswait: -1
+ delay: 0
+ identifierpath: ""
+ apipagelimit: 0
+ identifiertype: identifiersha
+ fixcontextoption: 0
+ acceptcontenttype: application/ld+json, text/html
+ jsonprofile: application/ld+json
\ No newline at end of file
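
A small sketch of how a consumer might read this sources list and keep only active entries; the path and filtering logic are assumptions, this is not gleaner's own loader:

```python
import yaml  # pip install pyyaml

with open("dagster/implnets/configs/eco/gleanerconfig.yaml") as f:
    cfg = yaml.safe_load(f)

# keep only sources flagged active, e.g. to decide what to schedule
active = [s for s in cfg["sources"] if s.get("active")]
for s in active:
    print(s["sourcetype"], s["name"], s["url"])
```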
diff --git a/dagster/implnets/configs/eco/nabuconfig.yaml b/dagster/implnets/configs/eco/nabuconfig.yaml
index 75ff07e4..3218d9a3 100644
--- a/dagster/implnets/configs/eco/nabuconfig.yaml
+++ b/dagster/implnets/configs/eco/nabuconfig.yaml
@@ -7,14 +7,34 @@ minio:
ssl: true
context:
cache: true
- strict: true
+ strict: false
contextmaps:
- prefix: "https://schema.org/"
file: "/assets/schemaorg-current-https.jsonld" # wget http://schema.org/docs/jsonldcontext.jsonld
- prefix: "http://schema.org/"
file: "/assets/schemaorg-current-http.jsonld" # wget http://schema.org/docs/jsonldcontext.jsonld
-sparql:
- endpoint:
+implementation_network:
+ orgname: eco
+endpoints:
+ - service: ec_blazegraph
+ baseurl: https://graph.geocodes-aws-dev.earthcube.org/blazegraph/namespace/test
+ type: blazegraph
+ authenticate: false
+ username:
+ password:
+ modes:
+ - action: sparql
+ suffix: /sparql
+ accept: application/sparql-results+json
+ method: GET
+ - action: update
+ suffix: /sparql
+ accept: application/sparql-update
+ method: POST
+ - action: bulk
+ suffix: /sparql
+ accept: text/x-nquads
+ method: POST
objects:
domain: us-east-1
prefix:
@@ -30,6 +50,17 @@ objects:
- summoned/r2r
- summoned/ssdbiodp
- summoned/unavco
+ - summoned/glim
+ - summoned/gpp
+ - summoned/nitrogen
+ - summoned/nitrogen2
+ - summoned/hydrography90m
+ - summoned/neon4cast
+ - summoned/usgsrc4cast
+ - summoned/vera4cast
+ - summoned/osmc
+ - summoned/obis
+ - summoned/geochemistry_custom
- prov/aquadocs
- prov/bcodmo
- prov/cchdo
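
The endpoints/modes block replaces the old flat `sparql: endpoint`; each action resolves to baseurl + suffix with its own method and accept header. A sketch of that resolution (assumed logic, not nabu's actual loader):

```python
import yaml

with open("dagster/implnets/configs/eco/nabuconfig.yaml") as f:
    cfg = yaml.safe_load(f)

for ep in cfg["endpoints"]:
    for mode in ep["modes"]:
        # e.g. update -> POST <baseurl>/sparql with application/sparql-update
        print(mode["action"], mode["method"], ep["baseurl"] + mode["suffix"], mode["accept"])
```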
diff --git a/dagster/implnets/configs/eco/tenant.yaml b/dagster/implnets/configs/eco/tenant.yaml
new file mode 100644
index 00000000..8993833c
--- /dev/null
+++ b/dagster/implnets/configs/eco/tenant.yaml
@@ -0,0 +1,41 @@
+# prototype tenants file
+
+tenant:
+ - community: dev
+ hostname: geocodes-dev
+ description: GeoCodes is...
+ name: Geocodes Science on Schema
+ url: https://www.earthcube.org
+ logo: https://unsplash.com/random
+ graph:
+ main_namespace: test
+ summary_namespace: test_summary
+ sources:
+ - iris
+ - geocodes_demo_datasets
+######
+ - community: geocodesall
+ hostname: geocodes-all
+ description: GeoCodes is...
+ name: Geocodes Science on Schema
+ url: https://www.earthcube.org
+ logo: https://unsplash.com/random
+ graph:
+ main_namespace: geocodes_test
+ summary_namespace: geocodes_test_summary
+ sources:
+ - all
+# - community: dev3
+# hostname: geocodes-dev32
+# description: GeoCodes is...
+# name: Geocodes Science on Schema
+# url: https://www.earthcube.org
+# logo: https://unsplash.com/random
+# graph:
+# main_namespace: test3
+# summary_namespace: test3_summary
+# sources:
+# - iris
+# - geocodes_examples
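
A sketch of how this tenant file could be resolved against the sources config, treating "all" as every active source; the special-casing of "all" is an assumption based on the geocodesall entry above:

```python
import yaml

tenants = yaml.safe_load(open("dagster/implnets/configs/eco/tenant.yaml"))["tenant"]
gleaner = yaml.safe_load(open("dagster/implnets/configs/eco/gleanerconfig.yaml"))
all_active = [s["name"] for s in gleaner["sources"] if s.get("active")]

for t in tenants:
    resolved = all_active if t["sources"] == ["all"] else t["sources"]
    print(t["community"], t["graph"]["main_namespace"], "->", resolved)
```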
diff --git a/dagster/implnets/configs/eco/workspace.yaml b/dagster/implnets/configs/eco/workspace.yaml
index b482024f..286e70e4 100644
--- a/dagster/implnets/configs/eco/workspace.yaml
+++ b/dagster/implnets/configs/eco/workspace.yaml
@@ -14,10 +14,14 @@ load_from:
port: 4000
location_name: "tasks"
- grpc_server:
- host: dagster-code-project
+ host: dagster-code-ingest
port: 4000
- location_name: "project_grpc"
+ location_name: "ingest"
+# - grpc_server:
+# host: dagster-code-project
+# port: 4000
+# location_name: "project_grpc"
- grpc_server:
host: dagster-code-eco-ecrr
port: 4000
- location_name: "project_ecrr"
+ location_name: "ecrr"
diff --git a/dagster/implnets/configs/ecrr/gleanerconfig.yaml b/dagster/implnets/configs/ecrr/gleanerconfig.yaml
index 69fa76e5..fa4fbce4 100644
--- a/dagster/implnets/configs/ecrr/gleanerconfig.yaml
+++ b/dagster/implnets/configs/ecrr/gleanerconfig.yaml
@@ -20,6 +20,26 @@ minio:
accesskey:
secretkey:
sources:
+ # not a sitemap... an S3 directory readable by nabu... but this will be difficult.
+# - sourcetype: sitemap
+# name: resource_registry
+# logo: https://www.earthcube.org/sites/default/files/doc-repository/logo_earthcube_full_horizontal.png
+# url: https://object.cloud.sdsc.edu/v1/AUTH_85f46aa78936477d8e71b186269414e8/gleaner-summoned
+# headless: false
+# pid: ""
+# propername: Resource Registry
+# domain: http://www.earthcube.org/resourceregistry/
+# active: true
+# credentialsfile: ""
+# other: { }
+# headlesswait: 0
+# delay: 0
+# identifierpath: $.@id
+# apipagelimit: 0
+# identifiertype: identifierstring
+# fixcontextoption: 0
+# acceptcontenttype: application/ld+json, text/html
+# jsonprofile: application/ld+json
- sourcetype: googledrive
name: ecrr_submitted
logo: https://www.earthcube.org/sites/default/files/doc-repository/logo_earthcube_full_horizontal.png
@@ -42,7 +62,7 @@ sources:
- sourcetype: sitemap
name: ecrr_examples
logo: https://www.earthcube.org/sites/default/files/doc-repository/logo_earthcube_full_horizontal.png
- url: https://raw.githubusercontent.com/earthcube/ecrro/gh-pages/Examples/sitemap.xml
+ url: https://earthcube.github.io/ecrro/Examples/sitemap.xml
headless: false
pid: ""
propername: Earthcube Resource Registry Examples
diff --git a/dagster/implnets/configs/ecrr/nabuconfing.yaml b/dagster/implnets/configs/ecrr/nabuconfing.yaml
index dc65407d..9631a62e 100644
--- a/dagster/implnets/configs/ecrr/nabuconfing.yaml
+++ b/dagster/implnets/configs/ecrr/nabuconfing.yaml
@@ -15,10 +15,27 @@ objects:
- org
prefixoff:
- summoned/ecrr_examples
-sparql:
- endpoint: http://localhost/blazegraph/namespace/earthcube/sparql
+implementation_network:
+ orgname: ecrr
+endpoints:
+ - service: ec_blazegraph
+ baseurl: http://localhost/blazegraph/namespace/earthcube/sparql
+ type: blazegraph
authenticate: false
- username: ""
- password: ""
+ username:
+ password:
+ modes:
+ - action: sparql
+ suffix: /sparql
+ accept: application/sparql-results+json
+ method: GET
+ - action: update
+ suffix: /sparql
+ accept: application/sparql-update
+ method: POST
+ - action: bulk
+ suffix: /sparql
+ accept: text/x-nquads
+ method: POST
txtaipkg:
endpoint: http://0.0.0.0:8000
diff --git a/dagster/implnets/configs/iow/workspace.yaml b/dagster/implnets/configs/iow/workspace.yaml
index 22d79140..0b4702cb 100644
--- a/dagster/implnets/configs/iow/workspace.yaml
+++ b/dagster/implnets/configs/iow/workspace.yaml
@@ -14,9 +14,13 @@ load_from:
port: 4000
location_name: "tasks"
- grpc_server:
- host: dagster-code-project
+ host: dagster-code-ingest
port: 4000
- location_name: "project_grpc"
+ location_name: "ingest"
+# - grpc_server:
+# host: dagster-code-project
+# port: 4000
+# location_name: "project_grpc"
# - grpc_server:
# host: dagster-code-iow-ecrr
# port: 4000
diff --git a/dagster/implnets/configs/nsdf/workspace.yaml b/dagster/implnets/configs/nsdf/workspace.yaml
index 2b92249b..62667004 100644
--- a/dagster/implnets/configs/nsdf/workspace.yaml
+++ b/dagster/implnets/configs/nsdf/workspace.yaml
@@ -7,5 +7,12 @@ load_from:
# relative_path: "workflows/ecrr/repositories/repository.py"
# working_directory: "./workflows/ecrr/"
# module starting out with the definitions api
- - python_module: "workflows.tasks.tasks"
+ - grpc_server:
+ host: dagster-code-ingest
+ port: 4000
+ location_name: "ingest"
+# - grpc_server:
+# host: dagster-code-project
+# port: 4000
+# location_name: "project_grpc"
diff --git a/dagster/implnets/configs/oih/workspace.yaml b/dagster/implnets/configs/oih/workspace.yaml
index 8ceb08fe..ded83cee 100644
--- a/dagster/implnets/configs/oih/workspace.yaml
+++ b/dagster/implnets/configs/oih/workspace.yaml
@@ -7,5 +7,12 @@ load_from:
# relative_path: "workflows/ecrr/repositories/repository.py"
# working_directory: "./workflows/ecrr/"
# module starting out with the definitions api
- - python_module: "workflows.tasks.tasks"
+ - grpc_server:
+ host: dagster-code-ingest
+ port: 4000
+ location_name: "ingest"
+ # - grpc_server:
+ # host: dagster-code-project
+ # port: 4000
+ # location_name: "project_grpc"
diff --git a/dagster/implnets/deployment/compose.yaml b/dagster/implnets/deployment/compose.yaml
index 0e1ccba7..3431d06e 100644
--- a/dagster/implnets/deployment/compose.yaml
+++ b/dagster/implnets/deployment/compose.yaml
@@ -80,7 +80,7 @@ services:
- "traefik.http.services.sched-${PROJECT:-eco}.loadbalancer.server.port=3000"
- "traefik.docker.network=traefik_proxy"
- "traefik.http.middlewares.sched-${PROJECT:-eco}.headers.accesscontrolallowmethods=GET,OPTIONS,POST"
- - "traefik.http.middlewares.sched-${PROJECT:-eco}.headers.accesscontrolalloworigin=*"
+ - "traefik.http.middlewares.sched-${PROJECT:-eco}.headers.accessControlAllowOriginList=*"
- "traefik.http.middlewares.sched-${PROJECT:-eco}.headers.accesscontrolmaxage=100"
- "traefik.http.middlewares.sched-${PROJECT:-eco}.headers.addvaryheader=true"
dagster-daemon:
diff --git a/dagster/implnets/deployment/compose_local.yaml b/dagster/implnets/deployment/compose_local.yaml
index fd935c45..6ed3d4b7 100644
--- a/dagster/implnets/deployment/compose_local.yaml
+++ b/dagster/implnets/deployment/compose_local.yaml
@@ -11,15 +11,16 @@ networks:
volumes:
dagster-postgres:
driver: local
-
+# dagster-storage:
+# driver: local
# let yourself use local configuration
configs:
- gleaner:
- name: ${GLEANERIO_GLEANER_DOCKER_CONFIG:-gleaner}
- file: ../configs/${PROJECT:-eco}/gleanerconfig.yaml
- nabu:
- name: ${GLEANERIO_NABU_DOCKER_CONFIG:-nabu}
- file: ../configs/${PROJECT:-eco}/nabuconfig.yaml
+# gleaner:
+# name: ${GLEANERIO_DOCKER_GLEANER_CONFIG:-gleaner}
+# file: ../configs/${PROJECT:-eco}/gleanerconfig.yaml
+# nabu:
+# name: ${GLEANERIO_DOCKER_NABU_CONFIG:-nabu}
+# file: ../configs/${PROJECT:-eco}/nabuconfig.yaml
workspace:
name: ${GLEANERIO_WORKSPACE_DOCKER_CONFIG:-workspace}
file: ../configs/${PROJECT:-eco}/workspace.yaml
@@ -46,49 +47,62 @@ services:
# gid: "103"
mode:
0444
- - source: gleaner
- target: /scheduler/gleanerconfig.yaml
- mode:
- 0444
- - source: nabu
- target: /scheduler/nabuconfig.yaml
- # uid: "103"
- # gid: "103"
- mode:
- 044
+# - source: gleaner
+# target: /scheduler/gleanerconfig.yaml
+# mode:
+# 0444
+# - source: nabu
+# target: /scheduler/nabuconfig.yaml
+# # uid: "103"
+# # gid: "103"
+# mode:
+# 044
volumes: &vol
- ../deployment/dagster.yaml:/usr/src/app/dagster.yaml
- - ../generatedCode/implnet-${PROJECT:-eco}/output/:/usr/src/app/project/${PROJECT:-eco}
+ # - ../generatedCode/implnet-${PROJECT:-eco}/output/:/usr/src/app/project/${PROJECT:-eco}
- ../workflows/:/usr/src/app/workflows
# GLEANEERIO_ the environment variables for this stack, passed into containers
# the variables passed into the containers varies due to inconsistent standards.
# this there are prefixed by project aka ECO_ for customization
# DO NOT RENAME THE FIRST PART, aka the container environment variable,
# unless you sure what you are doing
+ # sort these in BBedit to make finding them easier
environment: &env
- - DEBUG=${DEBUG:-false}
- - PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
- - PORTAINER_URL=${PORTAINER_URL}
- - PORTAINER_KEY=${PORTAINER_KEY}
+ - DEBUG_CONTAINER=${DEBUG_CONTAINER:-false}
+ - GLEANERIO_CONFIG_PATH=${GLEANERIO_CONFIG_PATH:-scheduler/configs/test/}
+ - GLEANERIO_DAGSTER_CONFIG_PATH=${GLEANERIO_DAGSTER_CONFIG_PATH:-scheduler/logs/}
+ - GLEANERIO_DOCKER_CONTAINER_WAIT_TIMEOUT=${GLEANERIO_DOCKER_CONTAINER_WAIT_TIMEOUT:-300}
+ - GLEANERIO_DOCKER_GLEANER_CONFIG=${GLEANERIO_DOCKER_GLEANER_CONFIG:-gleaner}
+ - GLEANERIO_DOCKER_HEADLESS_NETWORK=${GLEANERIO_DOCKER_HEADLESS_NETWORK}
+ - GLEANERIO_DOCKER_NABU_CONFIG=${GLEANERIO_DOCKER_NABU_CONFIG:-nabu}
+ - GLEANERIO_DOCKER_URL=${GLEANERIO_DOCKER_URL}
+ - GLEANERIO_DOCKER_WORKSPACE_CONFIG=${GLEANERIO_DOCKER_WORKSPACE_CONFIG}
+ - GLEANERIO_GLEANER_CONFIG_PATH=${GLEANERIO_GLEANER_CONFIG_PATH:-/configs/gleaner/gleanerconfig.yaml}
- GLEANERIO_GLEANER_IMAGE=${GLEANERIO_GLEANER_IMAGE}
- - GLEANERIO_NABU_IMAGE=${GLEANERIO_NABU_IMAGE}
- - GLEANERIO_LOG_PREFIX=${GLEANERIO_LOG_PREFIX}
- - GLEANERIO_SUMMARY_GRAPH_ENDPOINT=${GLEANERIO_SUMMARY_GRAPH_ENDPOINT}
- - GLEANERIO_SUMMARY_GRAPH_NAMESPACE=${GLEANERIO_SUMMARY_GRAPH_NAMESPACE}
+ - GLEANERIO_GRAPH_NAMESPACE=${GLEANERIO_GRAPH_NAMESPACE}
+ - GLEANERIO_GRAPH_SUMMARIZE=${GLEANERIO_GRAPH_SUMMARIZE:-false}
+ - GLEANERIO_GRAPH_SUMMARY_ENDPOINT=${GLEANERIO_GRAPH_SUMMARY_ENDPOINT:-${GLEANERIO_GRAPH_URL}}
+ - GLEANERIO_GRAPH_SUMMARY_NAMESPACE=${GLEANERIO_GRAPH_SUMMARY_NAMESPACE}
+ - GLEANERIO_GRAPH_URL=${GLEANERIO_GRAPH_URL}
+ - GLEANERIO_HEADLESS_ENDPOINT=${GLEANERIO_HEADLESS_ENDPOINT}
+ - GLEANERIO_LOG_PREFIX=${GLEANERIO_LOG_PREFIX:-scheduler/logs/}
+ - GLEANERIO_MINIO_ACCESS_KEY=${GLEANERIO_MINIO_ACCESS_KEY}
- GLEANERIO_MINIO_ADDRESS=${GLEANERIO_MINIO_ADDRESS}
- - GLEANERIO_MINIO_PORT=${GLEANERIO_MINIO_PORT}
- - GLEANERIO_MINIO_USE_SSL=${GLEANERIO_MINIO_USE_SSL}
- GLEANERIO_MINIO_BUCKET=${GLEANERIO_MINIO_BUCKET}
- - GLEANERIO_MINIO_ACCESS_KEY=${GLEANERIO_MINIO_ACCESS_KEY}
+ - GLEANERIO_MINIO_PORT=${GLEANERIO_MINIO_PORT}
- GLEANERIO_MINIO_SECRET_KEY=${GLEANERIO_MINIO_SECRET_KEY}
- - GLEANERIO_HEADLESS_ENDPOINT=${GLEANERIO_HEADLESS_ENDPOINT}
- - GLEANERIO_HEADLESS_NETWORK=${GLEANERIO_HEADLESS_NETWORK}
- - GLEANERIO_GRAPH_URL=${GLEANERIO_GRAPH_URL}
- - GLEANERIO_GRAPH_NAMESPACE=${GLEANERIO_GRAPH_NAMESPACE}
+ - GLEANERIO_MINIO_USE_SSL=${GLEANERIO_MINIO_USE_SSL}
- GLEANERIO_NABU_CONFIG_PATH=${GLEANERIO_NABU_CONFIG_PATH:-/configs/gleaner/nabuconfig.yaml}
- - GLEANERIO_GLEANER_CONFIG_PATH=${GLEANERIO_GLEANER_CONFIG_PATH:-/configs/gleaner/gleanerconfig.yaml}
- - GLEANERIO_NABU_DOCKER_CONFIG=${GLEANERIO_NABU_DOCKER_CONFIG:-nabu}
- - GLEANERIO_GLEANER_DOCKER_CONFIG=${GLEANERIO_GLEANER_DOCKER_CONFIG:-gleaner}
+ - GLEANERIO_NABU_IMAGE=${GLEANERIO_NABU_IMAGE}
+ - GLEANERIO_PORTAINER_APIKEY=${GLEANERIO_PORTAINER_APIKEY}
+ - GLEANERIO_DEFAULT_SCHEDULE=${GLEANERIO_DEFAULT_SCHEDULE:-@weekly}
+ - GLEANERIO_DEFAULT_SCHEDULE_TIMEZONE=${GLEANERIO_DEFAULT_SCHEDULE_TIMEZONE:-"America/Los_Angeles"}
+ - GLEANERIO_SOURCES_FILENAME=${GLEANERIO_SOURCES_FILENAME:-gleanerconfig.yaml}
+ - GLEANERIO_TENANT_FILENAME=${GLEANERIO_TENANT_FILENAME:-tenant.yaml}
+ - GLEANERIO_WORKSPACE_CONFIG_PATH=${GLEANERIO_WORKSPACE_CONFIG_PATH}
+ - PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
+ - SLACK_CHANNEL=${SLACK_CHANNEL:-"#twitterfeed"}
+ - SLACK_TOKEN=${SLACK_TOKEN}
ports:
- 3000:3000
@@ -96,7 +110,8 @@ services:
- traefik_proxy
depends_on:
- dagster-postgres
- - dagster-code-project
+# - dagster-code-project
+ - dagster-code-ingest
- dagster-code-tasks
labels:
- "traefik.enable=true"
@@ -114,7 +129,7 @@ services:
- "traefik.http.services.sched.loadbalancer.server.port=3000"
- "traefik.docker.network=traefik_proxy"
- "traefik.http.middlewares.sched.headers.accesscontrolallowmethods=GET,OPTIONS,POST"
- - "traefik.http.middlewares.sched.headers.accesscontrolalloworigin=*"
+ - "traefik.http.middlewares.sched.headers.accessControlAllowOriginList=*"
- "traefik.http.middlewares.sched.headers.accesscontrolmaxage=100"
- "traefik.http.middlewares.sched.headers.addvaryheader=true"
dagster-daemon:
@@ -134,7 +149,8 @@ services:
volumes: *vol
depends_on:
- dagster-postgres
- - dagster-code-project
+# - dagster-code-project
+ - dagster-code-ingest
- dagster-code-tasks
networks:
- traefik_proxy
@@ -192,14 +208,13 @@ services:
- traefik_proxy
- headless
# in code, use names defined in network above
-
- dagster-code-tasks:
+ dagster-code-ingest:
platform: linux/x86_64
build:
#context: .
context: ..
- dockerfile: build/Dockerfile_code
+ dockerfile: build/Dockerfile_workflows
args:
implnet: ${PROJECT:-eco}
# you should be able to change the source locally, without a full rebuild.
@@ -216,7 +231,7 @@ services:
- "-p"
- "4000"
- "-m"
- - "workflows.tasks.tasks"
+ - "workflows.ingest.ingest"
- "-d"
- "/usr/src/app/"
@@ -225,13 +240,14 @@ services:
- dagster-postgres
networks:
- traefik_proxy
- dagster-code-project:
+
+ dagster-code-tasks:
platform: linux/x86_64
build:
#context: .
context: ..
- dockerfile: build/Dockerfile_code
+ dockerfile: build/Dockerfile_workflows
args:
implnet: ${PROJECT:-eco}
# you should be able to change the source locally, without a full rebuild.
@@ -247,13 +263,45 @@ services:
- "0.0.0.0"
- "-p"
- "4000"
- - "--python-file"
- - "/usr/src/app/project/${PROJECT:-eco}/repositories/repository.py"
+ - "-m"
+ - "workflows.tasks.tasks"
- "-d"
- - "/usr/src/app/project/${PROJECT:-eco}/"
+ - "/usr/src/app/"
volumes: *vol
depends_on:
- dagster-postgres
networks:
- traefik_proxy
+# dagster-code-project:
+#
+# platform: linux/x86_64
+# build:
+# #context: .
+# context: ..
+# dockerfile: build/Dockerfile_code
+# args:
+# implnet: ${PROJECT:-eco}
+# # you should be able to change the source locally, without a full rebuild.
+# #image: dagster-${PROJECT:-eco}:latest
+# image: dagster-gleanerio-local:latest
+#
+# environment: *env
+# command:
+# - "dagster"
+# - "api"
+# - "grpc"
+# - "-h"
+# - "0.0.0.0"
+# - "-p"
+# - "4000"
+# - "--python-file"
+# - "/usr/src/app/project/${PROJECT:-eco}/repositories/repository.py"
+# - "-d"
+# - "/usr/src/app/project/${PROJECT:-eco}/"
+#
+# volumes: *vol
+# depends_on:
+# - dagster-postgres
+# networks:
+# - traefik_proxy
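
The `${VAR:-default}` fallbacks above mirror what code reading these variables would do; a sketch with the same defaults (variable names come from the compose file):

```python
import os

schedule = os.environ.get("GLEANERIO_DEFAULT_SCHEDULE", "@weekly")
timezone = os.environ.get("GLEANERIO_DEFAULT_SCHEDULE_TIMEZONE", "America/Los_Angeles")
sources_file = os.environ.get("GLEANERIO_SOURCES_FILENAME", "gleanerconfig.yaml")
tenant_file = os.environ.get("GLEANERIO_TENANT_FILENAME", "tenant.yaml")
print(schedule, timezone, sources_file, tenant_file)
```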
diff --git a/dagster/implnets/deployment/compose_local_eco_override.yaml b/dagster/implnets/deployment/compose_local_eco_override.yaml
index bb160303..4e39299c 100644
--- a/dagster/implnets/deployment/compose_local_eco_override.yaml
+++ b/dagster/implnets/deployment/compose_local_eco_override.yaml
@@ -12,7 +12,7 @@ services:
volumes: &vol
- ../configs/${PROJECT:-eco}/workspace.yaml:/usr/src/app/workspace.yaml
- ../deployment/dagster.yaml:/usr/src/app/dagster.yaml
- - ../generatedCode/implnet-${PROJECT:-eco}/output/:/usr/src/app/project/${PROJECT:-eco}
+ # - ../generatedCode/implnet-${PROJECT:-eco}/output/:/usr/src/app/project/${PROJECT:-eco}
- ../workflows/:/usr/src/app/workflows
# GLEANEERIO_ the environment variables for this stack, passed into containers
# the variables passed into the containers varies due to inconsistent standards.
@@ -20,35 +20,50 @@ services:
# DO NOT RENAME THE FIRST PART, aka the container environment variable,
# unless you sure what you are doing
environment: &env
- - DEBUG=${DEBUG:-false}
- - PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
- - PORTAINER_URL=${PORTAINER_URL}
- - PORTAINER_KEY=${PORTAINER_KEY}
+ - DEBUG_CONTAINER=${DEBUG_CONTAINER:-false}
+ - GLEANERIO_CONFIG_PATH=${GLEANERIO_CONFIG_PATH:-scheduler/configs/test/}
+ - GLEANERIO_DAGSTER_CONFIG_PATH=${GLEANERIO_DAGSTER_CONFIG_PATH:-scheduler/logs/}
+ - GLEANERIO_DOCKER_CONTAINER_WAIT_TIMEOUT=${GLEANERIO_DOCKER_CONTAINER_WAIT_TIMEOUT:-300}
+ - GLEANERIO_DOCKER_GLEANER_CONFIG=${GLEANERIO_DOCKER_GLEANER_CONFIG:-gleaner}
+ - GLEANERIO_DOCKER_HEADLESS_NETWORK=${GLEANERIO_DOCKER_HEADLESS_NETWORK}
+ - GLEANERIO_DOCKER_NABU_CONFIG=${GLEANERIO_DOCKER_NABU_CONFIG:-nabu}
+ - GLEANERIO_DOCKER_URL=${GLEANERIO_DOCKER_URL}
+ - GLEANERIO_DOCKER_WORKSPACE_CONFIG=${GLEANERIO_DOCKER_WORKSPACE_CONFIG}
+ - GLEANERIO_GLEANER_CONFIG_PATH=${GLEANERIO_GLEANER_CONFIG_PATH:-/configs/gleaner/gleanerconfig.yaml}
- GLEANERIO_GLEANER_IMAGE=${GLEANERIO_GLEANER_IMAGE}
- - GLEANERIO_NABU_IMAGE=${GLEANERIO_NABU_IMAGE}
- - GLEANERIO_LOG_PREFIX=${GLEANERIO_LOG_PREFIX}
+ - GLEANERIO_GRAPH_NAMESPACE=${GLEANERIO_GRAPH_NAMESPACE}
+ - GLEANERIO_GRAPH_SUMMARIZE=${GLEANERIO_GRAPH_SUMMARIZE:-false}
+ - GLEANERIO_GRAPH_SUMMARY_ENDPOINT=${GLEANERIO_GRAPH_SUMMARY_ENDPOINT:-${GLEANERIO_GRAPH_URL}}
+ - GLEANERIO_GRAPH_SUMMARY_NAMESPACE=${GLEANERIO_GRAPH_SUMMARY_NAMESPACE}
+ - GLEANERIO_GRAPH_URL=${GLEANERIO_GRAPH_URL}
+ - GLEANERIO_HEADLESS_ENDPOINT=${GLEANERIO_HEADLESS_ENDPOINT}
+ - GLEANERIO_LOG_PREFIX=${GLEANERIO_LOG_PREFIX:-scheduler/logs/}
+ - GLEANERIO_MINIO_ACCESS_KEY=${GLEANERIO_MINIO_ACCESS_KEY}
- GLEANERIO_MINIO_ADDRESS=${GLEANERIO_MINIO_ADDRESS}
- - GLEANERIO_MINIO_PORT=${GLEANERIO_MINIO_PORT}
- - GLEANERIO_MINIO_USE_SSL=${GLEANERIO_MINIO_USE_SSL}
- GLEANERIO_MINIO_BUCKET=${GLEANERIO_MINIO_BUCKET}
- - GLEANERIO_MINIO_ACCESS_KEY=${GLEANERIO_MINIO_ACCESS_KEY}
+ - GLEANERIO_MINIO_PORT=${GLEANERIO_MINIO_PORT}
- GLEANERIO_MINIO_SECRET_KEY=${GLEANERIO_MINIO_SECRET_KEY}
- - GLEANERIO_HEADLESS_ENDPOINT=${GLEANERIO_HEADLESS_ENDPOINT}
- - GLEANERIO_HEADLESS_NETWORK=${GLEANERIO_HEADLESS_NETWORK}
- - GLEANERIO_GRAPH_URL=${GLEANERIO_GRAPH_URL}
- - GLEANERIO_GRAPH_NAMESPACE=${GLEANERIO_GRAPH_NAMESPACE}
+ - GLEANERIO_MINIO_USE_SSL=${GLEANERIO_MINIO_USE_SSL}
- GLEANERIO_NABU_CONFIG_PATH=${GLEANERIO_NABU_CONFIG_PATH:-/configs/gleaner/nabuconfig.yaml}
- - GLEANERIO_GLEANER_CONFIG_PATH=${GLEANERIO_GLEANER_CONFIG_PATH:-/configs/gleaner/gleanerconfig.yaml}
- - GLEANERIO_NABU_DOCKER_CONFIG=${GLEANERIO_NABU_DOCKER_CONFIG:-nabu}
- - GLEANERIO_GLEANER_DOCKER_CONFIG=${GLEANERIO_GLEANER_DOCKER_CONFIG:-gleaner}
- - ECRR_MINIO_BUCKET="ECRR"
- - ECRR_GRAPH_NAMESPACE="ECRR"
+ - GLEANERIO_NABU_IMAGE=${GLEANERIO_NABU_IMAGE}
+ - GLEANERIO_PORTAINER_APIKEY=${GLEANERIO_PORTAINER_APIKEY}
+ - GLEANERIO_DEFAULT_SCHEDULE=${GLEANERIO_DEFAULT_SCHEDULE:-@weekly}
+ - GLEANERIO_DEFAULT_SCHEDULE_TIMEZONE=${GLEANERIO_DEFAULT_SCHEDULE_TIMEZONE:-America/Los_Angeles}
+ - GLEANERIO_SOURCES_FILENAME=${GLEANERIO_SOURCES_FILENAME:-gleanerconfig.yaml}
+ - GLEANERIO_TENANT_FILENAME=${GLEANERIO_TENANT_FILENAME:-tenant.yaml}
+ - GLEANERIO_WORKSPACE_CONFIG_PATH=${GLEANERIO_WORKSPACE_CONFIG_PATH}
+ - GLEANERIO_CSV_CONFIG_URL=${GLEANERIO_CSV_CONFIG_URL:-https://docs.google.com/spreadsheets/d/e/2PACX-1vTt_45dYd5LMFK9Qm_lCg6P7YxG-ae0GZEtrHMZmNbI-y5tVDd8ZLqnEeIAa-SVTSztejfZeN6xmRZF/pub?gid=1340502269&single=true&output=csv}
+ - ECRR_MINIO_BUCKET=${ECRR_MINIO_BUCKET}
+ - ECRR_GRAPH_NAMESPACE=${ECRR_GRAPH_NAMESPACE}
+ - PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
+ - SLACK_CHANNEL=${SLACK_CHANNEL:-"#twitterfeed"}
+ - SLACK_TOKEN=${SLACK_TOKEN}
# caution for a portainer additional file deploy
# portainer issue: merging commands may need to create a combined customized one
command: 'dagster-webserver -w workspace.yaml -h "0.0.0.0" -p 3000'
depends_on: &deps
- dagster-postgres
- - dagster-code-project
+ - dagster-code-ingest
- dagster-code-tasks
- dagster-code-eco-ecrr
@@ -82,10 +97,10 @@ services:
- "0.0.0.0"
- "-p"
- "4000"
- - "--python-file"
- - "/usr/src/app/workflows/ecrr/repositories/repository.py"
+ - "-m"
+ - "workflows.ecrr.ecrr"
- "-d"
- - "/usr/src/app/workflows/ecrr/"
+ - "/usr/src/app/"
volumes: *vol
depends_on:
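
Aside on the `-m` switch above: replacing `--python-file /usr/src/app/workflows/ecrr/repositories/repository.py` with `-m workflows.ecrr.ecrr` makes the gRPC code server import a module from the mounted `workflows` package, with `-d /usr/src/app/` keeping the package importable. A minimal sketch of what such a module must expose (names here are illustrative, not the repo's actual definitions):

```python
# hypothetical workflows/ecrr/ecrr.py -- just enough for
# `dagster api grpc -m workflows.ecrr.ecrr` to find definitions
from dagster import Definitions, job, op

@op
def harvest_ecrr():
    ...  # placeholder; the real op drives a gleaner/nabu harvest

@job
def implnet_job_ecrr():
    harvest_ecrr()

defs = Definitions(jobs=[implnet_job_ecrr])
```
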
diff --git a/dagster/implnets/deployment/compose_local_iow_override.yaml b/dagster/implnets/deployment/compose_local_iow_override.yaml
index 2179a294..faae1627 100644
--- a/dagster/implnets/deployment/compose_local_iow_override.yaml
+++ b/dagster/implnets/deployment/compose_local_iow_override.yaml
@@ -12,7 +12,7 @@ services:
volumes: &vol
- ../configs/${PROJECT:-iow}/workspace.yaml:/usr/src/app/workspace.yaml
- ../deployment/dagster.yaml:/usr/src/app/dagster.yaml
- - ../generatedCode/implnet-${PROJECT:-iow}/output/:/usr/src/app/project/${PROJECT:-iow}
+# - ../generatedCode/implnet-${PROJECT:-iow}/output/:/usr/src/app/project/${PROJECT:-iow}
- ../workflows/:/usr/src/app/workflows
# GLEANERIO_ : the environment variables for this stack, passed into containers
# the variables passed into the containers vary due to inconsistent standards.
@@ -20,27 +20,40 @@ services:
# DO NOT RENAME THE FIRST PART, aka the container environment variable,
# unless you are sure what you are doing
environment: &env
- - DEBUG=${DEBUG:-false}
- - PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
- - PORTAINER_URL=${PORTAINER_URL}
- - PORTAINER_KEY=${PORTAINER_KEY}
+ - DEBUG_CONTAINER=${DEBUG_CONTAINER:-false}
+ - GLEANERIO_CONFIG_PATH=${GLEANERIO_CONFIG_PATH:-scheduler/configs/test/}
+ - GLEANERIO_DAGSTER_CONFIG_PATH=${GLEANERIO_DAGSTER_CONFIG_PATH:-scheduler/logs/}
+ - GLEANERIO_DOCKER_CONTAINER_WAIT_TIMEOUT=${GLEANERIO_DOCKER_CONTAINER_WAIT_TIMEOUT:-300}
+ - GLEANERIO_DOCKER_GLEANER_CONFIG=${GLEANERIO_DOCKER_GLEANER_CONFIG:-gleaner}
+ - GLEANERIO_DOCKER_HEADLESS_NETWORK=${GLEANERIO_DOCKER_HEADLESS_NETWORK}
+ - GLEANERIO_DOCKER_NABU_CONFIG=${GLEANERIO_DOCKER_NABU_CONFIG:-nabu}
+ - GLEANERIO_DOCKER_URL=${GLEANERIO_DOCKER_URL}
+ - GLEANERIO_DOCKER_WORKSPACE_CONFIG=${GLEANERIO_DOCKER_WORKSPACE_CONFIG}
+ - GLEANERIO_GLEANER_CONFIG_PATH=${GLEANERIO_GLEANER_CONFIG_PATH:-/configs/gleaner/gleanerconfig.yaml}
- GLEANERIO_GLEANER_IMAGE=${GLEANERIO_GLEANER_IMAGE}
- - GLEANERIO_NABU_IMAGE=${GLEANERIO_NABU_IMAGE}
+ - GLEANERIO_GRAPH_NAMESPACE=${GLEANERIO_GRAPH_NAMESPACE}
+ - GLEANERIO_GRAPH_SUMMARIZE=${GLEANERIO_GRAPH_SUMMARIZE:-false}
+ - GLEANERIO_GRAPH_SUMMARY_ENDPOINT=${GLEANERIO_GRAPH_SUMMARY_ENDPOINT:-${GLEANERIO_GRAPH_URL}}
+ - GLEANERIO_GRAPH_SUMMARY_NAMESPACE=${GLEANERIO_GRAPH_SUMMARY_NAMESPACE}
+ - GLEANERIO_GRAPH_URL=${GLEANERIO_GRAPH_URL}
+ - GLEANERIO_HEADLESS_ENDPOINT=${GLEANERIO_HEADLESS_ENDPOINT}
+ - GLEANERIO_LOG_PREFIX=${GLEANERIO_LOG_PREFIX:-scheduler/logs/}
- - GLEANERIO_LOG_PREFIX=${GLEANERIO_LOG_PREFIX}
+ - GLEANERIO_MINIO_ACCESS_KEY=${GLEANERIO_MINIO_ACCESS_KEY}
- GLEANERIO_MINIO_ADDRESS=${GLEANERIO_MINIO_ADDRESS}
- - GLEANERIO_MINIO_PORT=${GLEANERIO_MINIO_PORT}
- - GLEANERIO_MINIO_USE_SSL=${GLEANERIO_MINIO_USE_SSL}
- GLEANERIO_MINIO_BUCKET=${GLEANERIO_MINIO_BUCKET}
- - GLEANERIO_MINIO_ACCESS_KEY=${GLEANERIO_MINIO_ACCESS_KEY}
+ - GLEANERIO_MINIO_PORT=${GLEANERIO_MINIO_PORT}
- GLEANERIO_MINIO_SECRET_KEY=${GLEANERIO_MINIO_SECRET_KEY}
- - GLEANERIO_HEADLESS_ENDPOINT=${GLEANERIO_HEADLESS_ENDPOINT}
- - GLEANERIO_HEADLESS_NETWORK=${GLEANERIO_HEADLESS_NETWORK}
- - GLEANERIO_GRAPH_URL=${GLEANERIO_GRAPH_URL}
- - GLEANERIO_GRAPH_NAMESPACE=${GLEANERIO_GRAPH_NAMESPACE}
+ - GLEANERIO_MINIO_USE_SSL=${GLEANERIO_MINIO_USE_SSL}
- GLEANERIO_NABU_CONFIG_PATH=${GLEANERIO_NABU_CONFIG_PATH:-/configs/gleaner/nabuconfig.yaml}
- - GLEANERIO_GLEANER_CONFIG_PATH=${GLEANERIO_GLEANER_CONFIG_PATH:-/configs/gleaner/gleanerconfig.yaml}
- - GLEANERIO_NABU_DOCKER_CONFIG=${GLEANERIO_NABU_DOCKER_CONFIG:-nabu}
- - GLEANERIO_GLEANER_DOCKER_CONFIG=${GLEANERIO_GLEANER_DOCKER_CONFIG:-gleaner}
+ - GLEANERIO_NABU_IMAGE=${GLEANERIO_NABU_IMAGE}
+ - GLEANERIO_PORTAINER_APIKEY=${GLEANERIO_PORTAINER_APIKEY}
+ - GLEANERIO_SOURCES_FILENAME=${GLEANERIO_SOURCES_FILENAME:-gleanerconfig.yaml}
+ - GLEANERIO_TENANT_FILENAME=${GLEANERIO_TENANT_FILENAME:-tenant.yaml}
+ - GLEANERIO_WORKSPACE_CONFIG_PATH=${GLEANERIO_WORKSPACE_CONFIG_PATH}
+ - PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
+ - SLACK_CHANNEL=${SLACK_CHANNEL:-"#twitterfeed"}
+ - SLACK_TOKEN=${SLACK_TOKEN}
# - ECRR_MINIO_BUCKET="ECRR"
# - ECRR_GRAPH_NAMESPACE="ECRR"
# caution for a portainer additional file deploy
@@ -48,7 +61,7 @@ services:
command: 'dagster-webserver -w workspace.yaml -h "0.0.0.0" -p 3000'
depends_on: &deps
- dagster-postgres
- - dagster-code-project
+ - dagster-code-ingest
- dagster-code-tasks
# - dagster-code-iow-ecrr
diff --git a/dagster/implnets/deployment/compose_project.yaml b/dagster/implnets/deployment/compose_project.yaml
index 51e1d30b..81be7d32 100644
--- a/dagster/implnets/deployment/compose_project.yaml
+++ b/dagster/implnets/deployment/compose_project.yaml
@@ -25,18 +25,22 @@ networks:
volumes:
dagster-postgres:
driver: local
+ dagster-storage:
+ driver: local
# external so it could be shared across docker swarms
configs:
- gleaner:
- name: ${GLEANERIO_GLEANER_DOCKER_CONFIG:-gleaner}
- external: true
- nabu:
- name: ${GLEANERIO_NABU_DOCKER_CONFIG:-nabu}
- external: true
+# gleaner:
+# name: ${GLEANERIO_DOCKER_GLEANER_CONFIG:-gleaner}
+# external: true
+# nabu:
+# name: ${GLEANERIO_DOCKER_NABU_CONFIG:-nabu}
+# external: true
workspace:
- name: ${GLEANERIO_WORKSPACE_DOCKER_CONFIG:-workspace}
+ name: ${GLEANERIO_DOCKER_WORKSPACE_CONFIG:-workspace}
+ external: true
+ dagster:
+ name: ${GLEANERIO_DOCKER_DAGSTER_CONFIG:-dagster}
external: true
-
secrets:
MINIO_ROOT_ACCESS_KEY:
external: true
@@ -60,16 +64,22 @@ services:
# gid: "103"
mode:
0444
- - source: gleaner
- target: /scheduler/gleanerconfig.yaml
- mode:
- 044
- - source: nabu
- target: /scheduler/nabuconfig.yaml
+ - source: dagster
+ target: /usr/src/app/dagster.yaml
# uid: "103"
# gid: "103"
mode:
- 044
+ 0444
+# - source: gleaner
+# target: /scheduler/gleanerconfig.yaml
+# mode:
+# 044
+# - source: nabu
+# target: /scheduler/nabuconfig.yaml
+# # uid: "103"
+# # gid: "103"
+# mode:
+# 044
secrets:
- MINIO_ROOT_ACCESS_KEY
- MINIO_ROOT_SECRET_KEY
@@ -79,31 +89,43 @@ services:
# DO NOT RENAME THE FIRST PART, aka the container environment variable,
# unless you are sure what you are doing
environment: &env
- - DEBUG=${DEBUG:-false}
- - PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
- - PORTAINER_URL=${PORTAINER_URL}
- - PORTAINER_KEY=${PORTAINER_KEY}
+ # sort these in BBedit to make finding them easier
+ - DEBUG_CONTAINER=${DEBUG_CONTAINER:-false}
+ - GLEANERIO_CONFIG_PATH=${GLEANERIO_CONFIG_PATH:-scheduler/configs/test/}
+ - GLEANERIO_DAGSTER_CONFIG_PATH=${GLEANERIO_DAGSTER_CONFIG_PATH:-scheduler/logs/}
+ - GLEANERIO_DOCKER_CONTAINER_WAIT_TIMEOUT=${GLEANERIO_DOCKER_CONTAINER_WAIT_TIMEOUT:-300}
+ - GLEANERIO_DOCKER_GLEANER_CONFIG=${GLEANERIO_DOCKER_GLEANER_CONFIG:-gleaner}
+ - GLEANERIO_DOCKER_HEADLESS_NETWORK=${GLEANERIO_DOCKER_HEADLESS_NETWORK}
+ - GLEANERIO_DOCKER_NABU_CONFIG=${GLEANERIO_DOCKER_NABU_CONFIG:-nabu}
+ - GLEANERIO_DOCKER_URL=${GLEANERIO_DOCKER_URL}
+ - GLEANERIO_DOCKER_WORKSPACE_CONFIG=${GLEANERIO_DOCKER_WORKSPACE_CONFIG}
+ - GLEANERIO_GLEANER_CONFIG_PATH=${GLEANERIO_GLEANER_CONFIG_PATH:-/configs/gleaner/gleanerconfig.yaml}
- GLEANERIO_GLEANER_IMAGE=${GLEANERIO_GLEANER_IMAGE}
- - GLEANERIO_NABU_IMAGE=${GLEANERIO_NABU_IMAGE}
- - GLEANERIO_LOG_PREFIX=${GLEANERIO_LOG_PREFIX}
+ - GLEANERIO_GRAPH_NAMESPACE=${GLEANERIO_GRAPH_NAMESPACE}
+ - GLEANERIO_GRAPH_SUMMARIZE=${GLEANERIO_GRAPH_SUMMARIZE:-false}
+ - GLEANERIO_GRAPH_SUMMARY_ENDPOINT=${GLEANERIO_GRAPH_SUMMARY_ENDPOINT:-${GLEANERIO_GRAPH_URL}}
+ - GLEANERIO_GRAPH_SUMMARY_NAMESPACE=${GLEANERIO_GRAPH_SUMMARY_NAMESPACE}
+ - GLEANERIO_GRAPH_URL=${GLEANERIO_GRAPH_URL}
+ - GLEANERIO_HEADLESS_ENDPOINT=${GLEANERIO_HEADLESS_ENDPOINT}
+ - GLEANERIO_LOG_PREFIX=${GLEANERIO_LOG_PREFIX:-scheduler/logs/}
+ - GLEANERIO_MINIO_ACCESS_KEY=${GLEANERIO_MINIO_ACCESS_KEY}
- GLEANERIO_MINIO_ADDRESS=${GLEANERIO_MINIO_ADDRESS}
- - GLEANERIO_MINIO_PORT=${GLEANERIO_MINIO_PORT}
- - GLEANERIO_MINIO_USE_SSL=${GLEANERIO_MINIO_USE_SSL}
- GLEANERIO_MINIO_BUCKET=${GLEANERIO_MINIO_BUCKET}
- - GLEANERIO_MINIO_ACCESS_KEY=${GLEANERIO_MINIO_ACCESS_KEY}
+ - GLEANERIO_MINIO_PORT=${GLEANERIO_MINIO_PORT}
- GLEANERIO_MINIO_SECRET_KEY=${GLEANERIO_MINIO_SECRET_KEY}
- - GLEANERIO_HEADLESS_ENDPOINT=${GLEANERIO_HEADLESS_ENDPOINT}
- - GLEANERIO_HEADLESS_NETWORK=${GLEANERIO_HEADLESS_NETWORK}
- - GLEANERIO_GRAPH_URL=${GLEANERIO_GRAPH_URL}
- - GLEANERIO_GRAPH_NAMESPACE=${GLEANERIO_GRAPH_NAMESPACE}
+ - GLEANERIO_MINIO_USE_SSL=${GLEANERIO_MINIO_USE_SSL}
- GLEANERIO_NABU_CONFIG_PATH=${GLEANERIO_NABU_CONFIG_PATH:-/configs/gleaner/nabuconfig.yaml}
- - GLEANERIO_GLEANER_CONFIG_PATH=${GLEANERIO_GLEANER_CONFIG_PATH:-/configs/gleaner/gleanerconfig.yaml}
- - GLEANERIO_NABU_DOCKER_CONFIG=${GLEANERIO_NABU_DOCKER_CONFIG:-nabu}
- - GLEANERIO_GLEANER_DOCKER_CONFIG=${GLEANERIO_GLEANER_DOCKER_CONFIG:-gleaner}
- - GLEANERIO_SUMMARY_GRAPH_ENDPOINT=${GLEANERIO_SUMMARY_GRAPH_ENDPOINT}
- - GLEANERIO_SUMMARY_GRAPH_NAMESPACE=${GLEANERIO_SUMMARY_GRAPH_NAMESPACE}
-# - GLEANER_MINIO_KEY=/run/secrets/MINIO_ROOT_ACCESS_KEY
-# - GLEANER_MINIO_SECRET=/run/secrets/MINIO_ROOT_SECRET_KEY
+ - GLEANERIO_NABU_IMAGE=${GLEANERIO_NABU_IMAGE}
+ - GLEANERIO_PORTAINER_APIKEY=${GLEANERIO_PORTAINER_APIKEY}
+ - GLEANERIO_DEFAULT_SCHEDULE=${GLEANERIO_DEFAULT_SCHEDULE:-@weekly}
+ - GLEANERIO_DEFAULT_SCHEDULE_TIMEZONE=${GLEANERIO_DEFAULT_SCHEDULE_TIMEZONE:-America/Los_Angeles}
+ - GLEANERIO_SOURCES_FILENAME=${GLEANERIO_SOURCES_FILENAME:-gleanerconfig.yaml}
+ - GLEANERIO_TENANT_FILENAME=${GLEANERIO_TENANT_FILENAME:-tenant.yaml}
+ - GLEANERIO_WORKSPACE_CONFIG_PATH=${GLEANERIO_WORKSPACE_CONFIG_PATH}
+ - GLEANERIO_CSV_CONFIG_URL=${GLEANERIO_CSV_CONFIG_URL:-https://docs.google.com/spreadsheets/d/e/2PACX-1vTt_45dYd5LMFK9Qm_lCg6P7YxG-ae0GZEtrHMZmNbI-y5tVDd8ZLqnEeIAa-SVTSztejfZeN6xmRZF/pub?gid=1340502269&single=true&output=csv}
+ - PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
+ - SLACK_CHANNEL=${SLACK_CHANNEL:-"#twitterfeed"}
+ - SLACK_TOKEN=${SLACK_TOKEN}
ports:
- 3000:3000
@@ -116,7 +138,7 @@ services:
- "traefik.enable=true"
- "traefik.http.routers.sched-${PROJECT:-eco}.entrypoints=http"
- "traefik.http.routers.sched-${PROJECT:-eco}.priority=13"
- - "traefik.http.routers.sched-${PROJECT:-eco}.rule=Host(`sched.${HOST? HOST is required}`)"
+ - "traefik.http.routers.sched-${PROJECT:-eco}.rule=Host(`${SCHED_HOSTNAME:-sched}.${HOST? HOST is required}`)"
- "traefik.http.middlewares.sched-https-redirect.redirectscheme.scheme=https"
- "traefik.http.routers.sched-${PROJECT:-eco}.middlewares=sched-https-redirect"
- "traefik.http.routers.sched-${PROJECT:-eco}-secure.entrypoints=https"
@@ -128,7 +150,7 @@ services:
- "traefik.http.services.sched-${PROJECT:-eco}.loadbalancer.server.port=3000"
- "traefik.docker.network=traefik_proxy"
- "traefik.http.middlewares.sched-${PROJECT:-eco}.headers.accesscontrolallowmethods=GET,OPTIONS,POST"
- - "traefik.http.middlewares.sched-${PROJECT:-eco}.headers.accesscontrolalloworigin=*"
+ - "traefik.http.middlewares.sched-${PROJECT:-eco}.headers.accessControlAllowOriginList=*"
- "traefik.http.middlewares.sched-${PROJECT:-eco}.headers.accesscontrolmaxage=100"
- "traefik.http.middlewares.sched-${PROJECT:-eco}.headers.addvaryheader=true"
dagster-daemon:
@@ -202,11 +224,42 @@ services:
- dagster_host
- headless
# in code, use names defined in network above
+ dagster-code-ingest:
+
+# build:
+# #context: .
+# context: ..
+# dockerfile: build/Dockerfile_code
+# args:
+# implnet: ${PROJECT:-eco}
+ # you should be able to change the source locally, without a full rebuild.
+ #image: dagster-${PROJECT:-eco}:latest
+ image: docker.io/nsfearthcube/dagster-gleanerio-workflows:${CONTAINER_CODE_TAG:-latest}
+ environment: *env
+ command:
+ - "dagster"
+ - "api"
+ - "grpc"
+ - "-h"
+ - "0.0.0.0"
+ - "-p"
+ - "4000"
+ - "-m"
+ - "workflows.ingest.ingest"
+ - "-d"
+ - "/usr/src/app/"
+
+ volumes: &codevol
+ - dagster-storage:/usr/src/app/storage
+ depends_on:
+ - dagster-postgres
+ networks:
+ - dagster_host
dagster-code-tasks:
# you should be able to change the source locally, without a full rebuild.
#image: dagster-${PROJECT:-eco}:latest
- image: docker.io/nsfearthcube/dagster-gleanerio-${PROJECT:-eco}:${CONTAINER_CODE_TAG:-latest}
+ image: docker.io/nsfearthcube/dagster-gleanerio-workflows:${CONTAINER_CODE_TAG:-latest}
environment: *env
command:
@@ -222,7 +275,7 @@ services:
- "-d"
- "/usr/src/app/"
- # volumes: *vol
+ volumes: *codevol
depends_on:
- dagster-postgres
networks:
@@ -249,4 +302,4 @@ services:
depends_on:
- dagster-postgres
networks:
- - dagster_host
\ No newline at end of file
+ - dagster_host
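
Most variables above use the compose `${VAR:-default}` pattern, and (per the comment in envFile.env later in this diff) the containers read them back with hard-coded `os.getenv()` defaults, so a restart is needed after changes. A rough sketch of the consuming side; the variable names match the compose file, but the defaults here are illustrative:

```python
# sketch of the env-reading pattern on the code-server side
import os

DEFAULT_SCHEDULE = os.getenv("GLEANERIO_DEFAULT_SCHEDULE", "@weekly")
DEFAULT_TIMEZONE = os.getenv("GLEANERIO_DEFAULT_SCHEDULE_TIMEZONE", "America/Los_Angeles")
WAIT_TIMEOUT = int(os.getenv("GLEANERIO_DOCKER_CONTAINER_WAIT_TIMEOUT", "300"))
SUMMARIZE = os.getenv("GLEANERIO_GRAPH_SUMMARIZE", "false").lower() == "true"
```
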
diff --git a/dagster/implnets/deployment/compose_project_eco_override.yaml b/dagster/implnets/deployment/compose_project_eco_override.yaml
index 13ee8d52..98f93502 100644
--- a/dagster/implnets/deployment/compose_project_eco_override.yaml
+++ b/dagster/implnets/deployment/compose_project_eco_override.yaml
@@ -15,29 +15,43 @@ services:
# DO NOT RENAME THE FIRST PART, aka the container environment variable,
# unless you are sure what you are doing
environment: &env
- - DEBUG=${DEBUG:-false}
- - PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
- - PORTAINER_URL=${PORTAINER_URL}
- - PORTAINER_KEY=${PORTAINER_KEY}
+ - DEBUG_CONTAINER=${DEBUG_CONTAINER:-false}
+ - GLEANERIO_CONFIG_PATH=${GLEANERIO_CONFIG_PATH:-scheduler/configs/test/}
+ - GLEANERIO_DAGSTER_CONFIG_PATH=${GLEANERIO_DAGSTER_CONFIG_PATH:-scheduler/logs/}
+ - GLEANERIO_DOCKER_CONTAINER_WAIT_TIMEOUT=${GLEANERIO_DOCKER_CONTAINER_WAIT_TIMEOUT:-300}
+ - GLEANERIO_DOCKER_GLEANER_CONFIG=${GLEANERIO_DOCKER_GLEANER_CONFIG:-gleaner}
+ - GLEANERIO_DOCKER_HEADLESS_NETWORK=${GLEANERIO_DOCKER_HEADLESS_NETWORK}
+ - GLEANERIO_DOCKER_NABU_CONFIG=${GLEANERIO_DOCKER_NABU_CONFIG:-nabu}
+ - GLEANERIO_DOCKER_URL=${GLEANERIO_DOCKER_URL}
+ - GLEANERIO_DOCKER_WORKSPACE_CONFIG=${GLEANERIO_DOCKER_WORKSPACE_CONFIG}
+ - GLEANERIO_GLEANER_CONFIG_PATH=${GLEANERIO_GLEANER_CONFIG_PATH:-/configs/gleaner/gleanerconfig.yaml}
- GLEANERIO_GLEANER_IMAGE=${GLEANERIO_GLEANER_IMAGE}
- - GLEANERIO_NABU_IMAGE=${GLEANERIO_NABU_IMAGE}
- - GLEANERIO_LOG_PREFIX=${GLEANERIO_LOG_PREFIX}
+ - GLEANERIO_GRAPH_SUMMARIZE=${GLEANERIO_GRAPH_SUMMARIZE:-false}
+ - GLEANERIO_GRAPH_SUMMARY_ENDPOINT=${GLEANERIO_GRAPH_SUMMARY_ENDPOINT:-${GLEANERIO_GRAPH_URL}}
+ - GLEANERIO_GRAPH_SUMMARY_NAMESPACE=${GLEANERIO_GRAPH_SUMMARY_NAMESPACE}
+ - GLEANERIO_GRAPH_URL=${GLEANERIO_GRAPH_URL}
+ - GLEANERIO_HEADLESS_ENDPOINT=${GLEANERIO_HEADLESS_ENDPOINT}
+ - GLEANERIO_LOG_PREFIX=${GLEANERIO_LOG_PREFIX:-scheduler/logs/}
+ - GLEANERIO_MINIO_ACCESS_KEY=${GLEANERIO_MINIO_ACCESS_KEY}
- GLEANERIO_MINIO_ADDRESS=${GLEANERIO_MINIO_ADDRESS}
- - GLEANERIO_MINIO_PORT=${GLEANERIO_MINIO_PORT}
- - GLEANERIO_MINIO_USE_SSL=${GLEANERIO_MINIO_USE_SSL}
- GLEANERIO_MINIO_BUCKET=${GLEANERIO_MINIO_BUCKET}
- - GLEANERIO_MINIO_ACCESS_KEY=${GLEANERIO_MINIO_ACCESS_KEY}
+ - GLEANERIO_MINIO_PORT=${GLEANERIO_MINIO_PORT}
- GLEANERIO_MINIO_SECRET_KEY=${GLEANERIO_MINIO_SECRET_KEY}
- - GLEANERIO_HEADLESS_ENDPOINT=${GLEANERIO_HEADLESS_ENDPOINT}
- - GLEANERIO_HEADLESS_NETWORK=${GLEANERIO_HEADLESS_NETWORK}
- - GLEANERIO_GRAPH_URL=${GLEANERIO_GRAPH_URL}
- - GLEANERIO_GRAPH_NAMESPACE=${GLEANERIO_GRAPH_NAMESPACE}
+ - GLEANERIO_MINIO_USE_SSL=${GLEANERIO_MINIO_USE_SSL}
- GLEANERIO_NABU_CONFIG_PATH=${GLEANERIO_NABU_CONFIG_PATH:-/configs/gleaner/nabuconfig.yaml}
- - GLEANERIO_GLEANER_CONFIG_PATH=${GLEANERIO_GLEANER_CONFIG_PATH:-/configs/gleaner/gleanerconfig.yaml}
- - GLEANERIO_NABU_DOCKER_CONFIG=${GLEANERIO_NABU_DOCKER_CONFIG:-nabu}
- - GLEANERIO_GLEANER_DOCKER_CONFIG=${GLEANERIO_GLEANER_DOCKER_CONFIG:-gleaner}
- - ECRR_MINIO_BUCKET="ECRR"
- - ECRR_GRAPH_NAMESPACE="ECRR"
+ - GLEANERIO_NABU_IMAGE=${GLEANERIO_NABU_IMAGE}
+ - GLEANERIO_PORTAINER_APIKEY=${GLEANERIO_PORTAINER_APIKEY}
+ - GLEANERIO_DEFAULT_SCHEDULE=${GLEANERIO_DEFAULT_SCHEDULE:-@weekly}
+ - GLEANERIO_DEFAULT_SCHEDULE_TIMEZONE=${GLEANERIO_DEFAULT_SCHEDULE_TIMEZONE:-America/Los_Angeles}
+ - GLEANERIO_SOURCES_FILENAME=${GLEANERIO_SOURCES_FILENAME:-gleanerconfig.yaml}
+ - GLEANERIO_TENANT_FILENAME=${GLEANERIO_TENANT_FILENAME:-tenant.yaml}
+ - GLEANERIO_WORKSPACE_CONFIG_PATH=${GLEANERIO_WORKSPACE_CONFIG_PATH}
+ - GLEANERIO_CSV_CONFIG_URL=${GLEANERIO_CSV_CONFIG_URL:-https://docs.google.com/spreadsheets/d/e/2PACX-1vTt_45dYd5LMFK9Qm_lCg6P7YxG-ae0GZEtrHMZmNbI-y5tVDd8ZLqnEeIAa-SVTSztejfZeN6xmRZF/pub?gid=1340502269&single=true&output=csv}
+ - ECRR_MINIO_BUCKET=${ECRR_MINIO_BUCKET}
+ - ECRR_GRAPH_NAMESPACE=${ECRR_GRAPH_NAMESPACE}
+ - PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
+ - SLACK_CHANNEL=${SLACK_CHANNEL:-"#twitterfeed"}
+ - SLACK_TOKEN=${SLACK_TOKEN}
# command:
@@ -49,7 +63,7 @@ services:
# - "-p"
# - "3000"
depends_on: &deps
- - dagster-code-project
+ - dagster-code-ingest
- dagster-code-tasks
dagster-daemon:
@@ -62,7 +76,7 @@ services:
# - "workspace.yaml"
depends_on:
- dagster-postgres
- - dagster-code-project
+ - dagster-code-ingest
- dagster-code-tasks
- dagster-code-eco-ecrr
networks:
@@ -72,7 +86,7 @@ services:
dagster-code-eco-ecrr:
# you should be able to change the source locally, without a full rebuild.
#image: dagster-${PROJECT:-eco}:latest
- image: docker.io/nsfearthcube/dagster-gleanerio-${PROJECT:-eco}:${CONTAINER_CODE_TAG:-latest}
+ image: docker.io/nsfearthcube/dagster-gleanerio-workflows:${CONTAINER_CODE_TAG:-latest}
environment: *env
command:
@@ -83,12 +97,13 @@ services:
- "0.0.0.0"
- "-p"
- "4000"
- - "--python-file"
- - "/usr/src/app/workflows/ecrr/repositories/repository.py"
+ - "-m"
+ - "workflows.ecrr.ecrr"
- "-d"
- - "/usr/src/app/workflows/ecrr/"
+ - "/usr/src/app/"
- #volumes: *vol
+ volumes:
+ - dagster-storage:/usr/src/app/storage
depends_on:
- dagster-postgres
networks:
diff --git a/dagster/implnets/deployment/dagster.yaml b/dagster/implnets/deployment/dagster.yaml
index 9ff78eed..0c726065 100644
--- a/dagster/implnets/deployment/dagster.yaml
+++ b/dagster/implnets/deployment/dagster.yaml
@@ -32,6 +32,7 @@ run_coordinator:
module: dagster.core.run_coordinator
class: QueuedRunCoordinator
config:
- max_concurrent_runs: 4
+# max_concurrent_runs: 4
+ max_concurrent_runs: 2
telemetry:
enabled: false
diff --git a/dagster/implnets/deployment/dagster_localrun.sh b/dagster/implnets/deployment/dagster_localrun.sh
index 0dc626b1..d51b585b 100755
--- a/dagster/implnets/deployment/dagster_localrun.sh
+++ b/dagster/implnets/deployment/dagster_localrun.sh
@@ -18,7 +18,9 @@ do
? ) helpFunction ;; # Print helpFunction in case parameter is non-existent
esac
done
-
+RED='\033[0;31m'
+Yellow='\033[0;33m'
+NC='\033[0m'
if [ ! $envfile ]
then
@@ -32,7 +34,7 @@ if [ -f $envfile ]
export $(sed '/^[ \t]*#/d' $envfile | sed '/^$/d' | xargs)
else
- echo "missing environment file. pass flag, or copy and edit file"
+ echo -e "${RED}missing environment file. Pass the flag, or copy and edit a file${NC}"
echo "cp envFile.env .env"
echo "OR"
echo "cp {yourenv}.env .env"
@@ -43,24 +45,24 @@ fi
## need to run docker (network|volume) ls | grep (traefik_proxy|traefik_proxy) before these calls
## or an error will be thrown
#echo "This message is OK **Error response from daemon: network with name traefik_proxy already exists.** "
-if [ "$(docker network ls | grep ${GLEANERIO_HEADLESS_NETWORK})" ] ; then
- echo ${GLEANERIO_HEADLESS_NETWORK} netowrk exists;
+if [ "$(docker network ls | grep ${GLEANERIO_DOCKER_HEADLESS_NETWORK})" ] ; then
+ echo ${GLEANERIO_DOCKER_HEADLESS_NETWORK} network exists;
else
echo creating network
if [ "$(docker info | grep Swarm | sed 's/Swarm: //g' | tr -d ' ')" == "inactive" ]; then
echo Not Swarm
- if `docker network create -d bridge --attachable ${GLEANERIO_HEADLESS_NETWORK}`; then
- echo 'Created network ${GLEANERIO_HEADLESS_NETWORK}'
+ if `docker network create -d bridge --attachable ${GLEANERIO_DOCKER_HEADLESS_NETWORK}`; then
+ echo "Created network ${GLEANERIO_DOCKER_HEADLESS_NETWORK}"
else
- echo "ERROR: *** Failed to create local network. "
+ echo -e "${RED}ERROR: *** Failed to create local network. ${NC}"
exit 1
fi
else
echo Is Swarm
- if `docker network create -d overlay --attachable ${GLEANERIO_HEADLESS_NETWORK}`; then
- echo 'Created network ${GLEANERIO_HEADLESS_NETWORK}'
+ if `docker network create -d overlay --attachable ${GLEANERIO_DOCKER_HEADLESS_NETWORK}`; then
+ echo "Created network ${GLEANERIO_DOCKER_HEADLESS_NETWORK}"
else
- echo "ERROR: *** Failed to create swarm network. "
+ echo -e "${RED}ERROR: *** Failed to create swarm network. ${NC}"
exit 1
fi
fi
@@ -70,9 +72,10 @@ fi
#echo NOTE: Verify that the traefik_proxy network SCOPE is swarm
-docker volume create ${GLEANERIO_CONFIG_VOLUME:-dagster_gleaner_configs}
-
-echo DO NOT FORGET TO USE pygen/makefile REGNERATE THE CODE.
+echo -e ${Yellow}DO NOT FORGET TO USE pygen/makefile TO REGENERATE THE CODE.${NC}
echo run as detached: $detached
@@ -89,5 +92,6 @@ if [ "$detached" = true ]
else
docker compose -p dagster --env-file $envfile -f compose_local.yaml $override_file up
fi
+echo -e ${Yellow}DO NOT FORGET TO USE pygen/makefile TO REGENERATE THE CODE.${NC}
+echo -e ${Yellow}If gleaner@project_grpc shows in the UI as not working, you most likely need to REGENERATE THE CODE.${NC}
-echo DO NOT FORGET TO USE pygen/makefile REGNERATE THE CODE.
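
The `export $(sed ...)` step earlier in this script strips comments and blank lines from the env file before exporting its variables. A rough Python equivalent, for reference (the `.env` filename is an assumption from the script's usage hints):

```python
# rough equivalent of: export $(sed '/^[ \t]*#/d' $envfile | sed '/^$/d' | xargs)
import os

def load_env(path: str) -> None:
    with open(path) as fh:
        for raw in fh:
            line = raw.strip()
            if not line or line.startswith("#"):
                continue  # the sed calls drop comments and blank lines
            key, _, value = line.partition("=")
            os.environ[key] = value

load_env(".env")  # assumed filename
```
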
diff --git a/dagster/implnets/deployment/dagster_setup_docker.sh b/dagster/implnets/deployment/dagster_setup_docker.sh
index 8fd6af07..7ba2f351 100755
--- a/dagster/implnets/deployment/dagster_setup_docker.sh
+++ b/dagster/implnets/deployment/dagster_setup_docker.sh
@@ -41,25 +41,25 @@ fi
## need to run docker (network|volume) ls | grep (traefik_proxy|traefik_proxy) before these calls
## or an error will be thrown
#echo "This message is OK **Error response from daemon: network with name traefik_proxy already exists.** "
-if [ "$(docker network ls | grep -${GLEANER_HEADLESS_NETWORK})" ] ; then
- echo ${GLEANER_HEADLESS_NETWORK} netowrk exists;
+if [ "$(docker network ls | grep -${GLEANERIO_DOCKER_HEADLESS_NETWORK})" ] ; then
+ echo ${GLEANERIO_DOCKER_HEADLESS_NETWORK} netowrk exists;
else
echo creating network
if [ "$(docker info | grep Swarm | sed 's/Swarm: //g')" == "inactive" ]; then
echo Not Swarm
- if `docker network create -d bridge --attachable ${GLEANER_HEADLESS_NETWORK}`; then
- echo 'Created network ${GLEANER_HEADLESS_NETWORK}'
+ if `docker network create -d bridge --attachable ${GLEANERIO_DOCKER_HEADLESS_NETWORK}`; then
+ echo "Created network ${GLEANERIO_DOCKER_HEADLESS_NETWORK}"
else
echo "ERROR: *** Failed to create local network. "
- exit 1
+ # exit 1
fi
else
echo Is Swarm
- if `docker network create -d overlay --attachable ${GLEANER_HEADLESS_NETWORK}`; then
- echo 'Created network ${GLEANER_HEADLESS_NETWORK}'
+ if `docker network create -d overlay --attachable ${GLEANERIO_DOCKER_HEADLESS_NETWORK}`; then
+ echo "Created network ${GLEANERIO_DOCKER_HEADLESS_NETWORK}"
else
echo "ERROR: *** Failed to create swarm network. "
- exit 1
+ #exit 1
fi
fi
@@ -67,9 +67,10 @@ fi
#echo NOTE: Verify that the traefik_proxy network SCOPE is swarm
-docker volume create ${GLEANER_CONFIG_VOLUME:-dagster_gleaner_configs}
+echo added network ${GLEANERIO_DOCKER_HEADLESS_NETWORK}
-echo added network ${GLEANER_HEADLESS_NETWORK} and volume ${GLEANER_CONFIG_VOLUME}
+docker volume create dagster-postgres
+docker volume create dagster-storage
if [ "$(docker config ls | grep -${GLEANERIO_GLEANER_CONFIG_PATH})" ] ; then
echo ${GLEANERIO_GLEANER_CONFIG_PATH} config exists;
@@ -79,8 +80,9 @@ else
if `docker config create gleaner-${PROJECT} ../configs/${PROJECT}/gleanerconfig.yaml`; then
echo 'Created gleaner config gleaner-${PROJECT} ${GLEANERIO_GLEANER_CONFIG_PATH}'
else
- echo "ERROR: *** Failed to create config. "
- exit 1
+ echo "ERROR: *** Failed to create docker/potainer config. gleaner-${PROJECT} ${GLEANERIO_GLEANER_CONFIG_PATH}"
+ echo "see if config exists "
+ # exit 1
fi
fi
@@ -92,8 +94,9 @@ else
if `docker config create nabu-${PROJECT} ../configs/${PROJECT}/nabuconfig.yaml`; then
echo 'Created gleaner config nabu-${PROJECT} ${GLEANERIO_NABU_CONFIG_PATH}'
else
- echo "ERROR: *** Failed to create config. "
- exit 1
+ echo "ERROR: *** Failed to create create docker/potainer config. nabu-${PROJECT} ${GLEANERIO_NABU_CONFIG_PATH} "
+ echo "see if config exists "
+ # exit 1
fi
fi
@@ -105,7 +108,22 @@ else
if `docker config create workspace-${PROJECT} ../configs/${PROJECT}/workspace.yaml`; then
echo 'Created gleaner config workspace-${PROJECT} ${GLEANERIO_WORKSPACE_CONFIG_PATH}'
else
- echo "ERROR: *** Failed to create config. "
- exit 1
+ echo "ERROR: *** Failed to create create docker/potainer config. workspace-${PROJECT} ${GLEANERIO_WORKSPACE_CONFIG_PATH}"
+ echo "see if config exists "
+ # exit 1
+ fi
+fi
+
+if [ "$(docker config ls | grep -${GLEANERIO_WORKSPACE_CONFIG_PATH})" ] ; then
+ echo ${GLEANERIO_WORKSPACE_CONFIG_PATH} config exists;
+else
+ echo creating config
+
+ if `docker config create workspace-${PROJECT} ../configs/${PROJECT}/workspace.yaml`; then
+ echo "Created gleaner config workspace-${PROJECT} ${GLEANERIO_WORKSPACE_CONFIG_PATH}"
+ else
+ echo "ERROR: *** Failed to create create docker/potainer config. workspace-${PROJECT} ${GLEANERIO_WORKSPACE_CONFIG_PATH}"
+ echo "see if config exists "
+ # exit 1
fi
fi
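
The config checks above shell out to `docker config ls` / `docker config create`. The same check-then-create flow can be done with docker-py; a hedged sketch (a swarm-mode endpoint is required, and the names assume PROJECT=eco):

```python
# hedged sketch of the script's check-then-create flow via docker-py
import docker

client = docker.from_env()
name = "workspace-eco"  # assumes PROJECT=eco
if client.configs.list(filters={"name": [name]}):
    print(f"{name} config exists")
else:
    with open("../configs/eco/workspace.yaml", "rb") as fh:
        client.configs.create(name=name, data=fh.read())
```
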
diff --git a/dagster/implnets/deployment/envFile.env b/dagster/implnets/deployment/envFile.env
index 4fa2699d..994418e5 100644
--- a/dagster/implnets/deployment/envFile.env
+++ b/dagster/implnets/deployment/envFile.env
@@ -1,42 +1,72 @@
+DAGSTER_HOME=dagster/dagster_home
+## PROJECT -- default 'eco'. This is a 'TRAEFIK router name' used to run multiple copies of the scheduler on a server
+# originally used to generate code for a specific project
+#PROJECT=test
+
+#PROJECT=eco
+#PROJECT=iow
+#PROJECT=oih
######
# Nabu and Gleaner configs need to be in docker configs
## docker config name GLEANERIO_DOCKER_GLEANER_CONFIG
## docker config name GLEANERIO_DOCKER_NABU_CONFIG
# suggested DOCKER_CONFIG NAMING PATTERN (nabu||gleaner)-{PROJECT}
########
-GLEANERIO_GLEANER_DOCKER_CONFIG=gleaner-eco
-GLEANERIO_NABU_DOCKER_CONFIG=nabu-eco
+GLEANERIO_DOCKER_GLEANER_CONFIG=gleaner-eco
+GLEANERIO_DOCKER_NABU_CONFIG=nabu-eco
# ###
# workspace for dagster
####
GLEANERIO_WORKSPACE_CONFIG_PATH=/usr/src/app/workspace.yaml
-GLEANERIO_WORKSPACE_DOCKER_CONFIG=workspace-eco
+GLEANERIO_DOCKER_WORKSPACE_CONFIG=workspace-eco
+
+GLEANERIO_DOCKER_DAGSTER_CONFIG=dagster
+
+
+DEBUG_CONTAINER=false
+
+#### HOST
+# host base name for traefik. fixed to localhost:3000 when using compose_local.
+HOST=localhost
+# Applies only to compose_project.yaml runs
+
+# modify SCHED_HOSTNAME if you want to run more than one instance
+# aka two different project harvests for now.
+SCHED_HOSTNAME=sched
+
+GLEANERIO_DOCKER_CONTAINER_WAIT_TIMEOUT=300
+# when debugging, set to 10-30 seconds
+# DEFAULT SCHEDULE
+# as defined by https://docs.dagster.io/concepts/partitions-schedules-sensors/schedules#basic-schedules
+# "@hourly", "@daily", "@weekly", and "@monthly"
+#GLEANERIO_DEFAULT_SCHEDULE=@weekly
+#GLEANERIO_DEFAULT_SCHEDULE_TIMEZONE="America/Los_Angeles"
+# the above are read via hard-coded os.getenv() calls, so when changed, the service needs to be restarted.
-DEBUG=False
-PROJECT=eco
+
+# tags for docker compose
CONTAINER_CODE_TAG=latest
CONTAINER_DAGSTER_TAG=latest
-#PROJECT=iow
-#PROJECT=oih
-HOST=localhost
+
PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
-# port is required: https://portainer.{HOST}:443/api/endpoints/2/docker/
-PORTAINER_URL=
-PORTAINER_KEY=
+# port is required: https://portainer.{HOST}:443/api/endpoints/9/docker/
+# 9 is dataloader, 2 is aws-dev
+GLEANERIO_DOCKER_URL=https://portainer.{HOST}:443/api/endpoints/9/docker/
+GLEANERIO_PORTAINER_APIKEY=
# if running dagster-dev, then this needs to be set ,
# defaults to "/scheduler/gleanerconfig.yaml" which is path to config mounted in containers
# when debugging generated code "../../../configs/eco/gleanerconfig.yaml"
# when debugging code in workflows "../../configs/eco/gleanerconfig.yaml"
-# DAGSTER_GLEANER_CONFIG_PATH=../../../configs/eco/gleanerconfig.yaml
+GLEANERIO_DAGSTER_CONFIG_PATH=../../../configs/eco/gleanerconfig.yaml
# Network
-GLEANERIO_HEADLESS_NETWORK=headless_gleanerio
+GLEANERIO_DOCKER_HEADLESS_NETWORK=headless_gleanerio
### GLEANER/NABU Dockers
-GLEANERIO_GLEANER_IMAGE=nsfearthcube/gleaner:latest
-GLEANERIO_NABU_IMAGE=nsfearthcube/nabu:latest
+GLEANERIO_GLEANER_IMAGE=nsfearthcube/gleaner:dev_ec
+GLEANERIO_NABU_IMAGE=nsfearthcube/nabu:dev_eco
##
# path where configs are deployed/mounted
@@ -56,10 +86,27 @@ GLEANERIO_MINIO_SECRET_KEY=
GLEANERIO_HEADLESS_ENDPOINT=http://headless:9222
# just the base address, no namespace https://graph.geocodes-aws-dev.earthcube.org/blazegraph
-GLEANERIO_GRAPH_URL=
-GLEANERIO_GRAPH_NAMESPACE=
+GLEANERIO_GRAPH_URL=https://graph.geocodes-aws.earthcube.org/blazegraph
+GLEANERIO_GRAPH_NAMESPACE=earthcube
+
+# optional: GLEANERIO_GRAPH_SUMMARY_ENDPOINT defaults to GLEANERIO_GRAPH_URL
+#GLEANERIO_GRAPH_SUMMARY_ENDPOINT=https://graph.geocodes-aws-dev.earthcube.org/blazegraph
+GLEANERIO_GRAPH_SUMMARY_NAMESPACE=earthcube_summary
+GLEANERIO_GRAPH_SUMMARIZE=True
+
+# where are the gleaner and tenant configurations
+GLEANERIO_CONFIG_PATH=scheduler/configs/
+GLEANERIO_TENANT_FILENAME=tenant.yaml
+GLEANERIO_SOURCES_FILENAME=gleanerconfig.yaml
+
+# ECO Custom variables for ecrr
+ECRR_GRAPH_NAMESPACE=ecrr
+ECRR_MINIO_BUCKET=ecrr
+
+# only a public Slack channel works. DV has no permissions to create a new channel
+SLACK_CHANNEL="#production_discussion"
+#SLACK_CHANNEL="#twitterfeed"
+SLACK_TOKEN=
-# example: https://graph.geocodes.ncsa.illinois.edu/blazegraph/namespace/yyearthcube2/sparql
-#graph endpoint will be GLEANERIO_GRAPH_URL
-GLEANERIO_SUMMARY_GRAPH_NAMESPACE=
+GLEANERIO_CSV_CONFIG_URL=https://docs.google.com/spreadsheets/d/e/2PACX-1vTt_45dYd5LMFK9Qm_lCg6P7YxG-ae0GZEtrHMZmNbI-y5tVDd8ZLqnEeIAa-SVTSztejfZeN6xmRZF/pub?gid=1340502269&single=true&output=csv
diff --git a/dagster/implnets/generatedCode/NOTE_DAGSTER_CLI.md b/dagster/implnets/generatedCode/NOTE_DAGSTER_CLI.md
new file mode 100644
index 00000000..0a897237
--- /dev/null
+++ b/dagster/implnets/generatedCode/NOTE_DAGSTER_CLI.md
@@ -0,0 +1,18 @@
+
+
+# RUNNING LOCALLY
+* You need to point at a docker STACK, or portainer endpoint... A local workstation docker is usually not a STACK.
+* set the ENV variables; I use the env file plugin in PyCharm
+
+`cd dagster/implnets/generatedCode/implnet-eco/output`
+`python -m dagster dev`
+
+## To run a job:
+`cd dagster/implnets/generatedCode/implnet-eco/output`
+`python -m dagster job execute -f jobs/implnet_jobs_ecrr_examples.py -j implnet_job_ecrr_examples`
+
+## The dagster CLI also works directly
+`cd dagster/implnets/generatedCode/implnet-eco/output`
+(set up the env first, e.g. `export $(sed '/^[ \t]*#/d' $envfile | sed '/^$/d' | xargs)`)
+* `dagster job list -w workspace.yaml`
+* `dagster job execute -f jobs/implnet_jobs_ecrr_examples.py -j implnet_job_ecrr_examples`
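
For completeness, the same example job can be run from Python instead of the CLI; a sketch assuming the generated layout referenced above (note this diff removes the committed copies of that code, so it must first be regenerated via pygen/makefile) and DAGSTER_HOME set as in envFile.env:

```python
# run from dagster/implnets/generatedCode/implnet-eco/output
from dagster import DagsterInstance
from jobs.implnet_jobs_ecrr_examples import implnet_job_ecrr_examples

result = implnet_job_ecrr_examples.execute_in_process(
    instance=DagsterInstance.get()  # requires DAGSTER_HOME to be set
)
print(result.success)
```
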
diff --git a/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_amgeo.py b/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_amgeo.py
deleted file mode 100644
index b850c609..00000000
--- a/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_amgeo.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_amgeo import harvest_amgeo
-
-@job
-def implnet_job_amgeo():
- harvest_amgeo()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_aquadocs.py b/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_aquadocs.py
deleted file mode 100644
index 8d384135..00000000
--- a/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_aquadocs.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_aquadocs import harvest_aquadocs
-
-@job
-def implnet_job_aquadocs():
- harvest_aquadocs()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_bcodmo.py b/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_bcodmo.py
deleted file mode 100644
index d5db3109..00000000
--- a/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_bcodmo.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_bcodmo import harvest_bcodmo
-
-@job
-def implnet_job_bcodmo():
- harvest_bcodmo()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_cchdo.py b/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_cchdo.py
deleted file mode 100644
index d8380c4c..00000000
--- a/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_cchdo.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_cchdo import harvest_cchdo
-
-@job
-def implnet_job_cchdo():
- harvest_cchdo()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_datadiscoverystudio.py b/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_datadiscoverystudio.py
deleted file mode 100644
index ec41e220..00000000
--- a/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_datadiscoverystudio.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_datadiscoverystudio import harvest_datadiscoverystudio
-
-@job
-def implnet_job_datadiscoverystudio():
- harvest_datadiscoverystudio()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_designsafe.py b/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_designsafe.py
deleted file mode 100644
index 122ba25c..00000000
--- a/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_designsafe.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_designsafe import harvest_designsafe
-
-@job
-def implnet_job_designsafe():
- harvest_designsafe()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_earthchem.py b/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_earthchem.py
deleted file mode 100644
index 31fdf4e1..00000000
--- a/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_earthchem.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_earthchem import harvest_earthchem
-
-@job
-def implnet_job_earthchem():
- harvest_earthchem()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_ecrr_examples.py b/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_ecrr_examples.py
deleted file mode 100644
index cb51bb8d..00000000
--- a/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_ecrr_examples.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_ecrr_examples import harvest_ecrr_examples
-
-@job
-def implnet_job_ecrr_examples():
- harvest_ecrr_examples()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_edi.py b/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_edi.py
deleted file mode 100644
index 1f1a7229..00000000
--- a/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_edi.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_edi import harvest_edi
-
-@job
-def implnet_job_edi():
- harvest_edi()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_geocodes_demo_datasets.py b/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_geocodes_demo_datasets.py
deleted file mode 100644
index 144da333..00000000
--- a/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_geocodes_demo_datasets.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_geocodes_demo_datasets import harvest_geocodes_demo_datasets
-
-@job
-def implnet_job_geocodes_demo_datasets():
- harvest_geocodes_demo_datasets()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_geocodes_examples.py b/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_geocodes_examples.py
deleted file mode 100644
index bf0435ac..00000000
--- a/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_geocodes_examples.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_geocodes_examples import harvest_geocodes_examples
-
-@job
-def implnet_job_geocodes_examples():
- harvest_geocodes_examples()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_hydroshare.py b/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_hydroshare.py
deleted file mode 100644
index 515e329a..00000000
--- a/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_hydroshare.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_hydroshare import harvest_hydroshare
-
-@job
-def implnet_job_hydroshare():
- harvest_hydroshare()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_iedadata.py b/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_iedadata.py
deleted file mode 100644
index 44478429..00000000
--- a/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_iedadata.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_iedadata import harvest_iedadata
-
-@job
-def implnet_job_iedadata():
- harvest_iedadata()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_iris.py b/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_iris.py
deleted file mode 100644
index 2cf195c3..00000000
--- a/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_iris.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_iris import harvest_iris
-
-@job
-def implnet_job_iris():
- harvest_iris()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_linkedearth.py b/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_linkedearth.py
deleted file mode 100644
index 57b17ed1..00000000
--- a/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_linkedearth.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_linkedearth import harvest_linkedearth
-
-@job
-def implnet_job_linkedearth():
- harvest_linkedearth()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_lipdverse.py b/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_lipdverse.py
deleted file mode 100644
index 57daa163..00000000
--- a/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_lipdverse.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_lipdverse import harvest_lipdverse
-
-@job
-def implnet_job_lipdverse():
- harvest_lipdverse()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_magic.py b/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_magic.py
deleted file mode 100644
index 7a8140d6..00000000
--- a/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_magic.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_magic import harvest_magic
-
-@job
-def implnet_job_magic():
- harvest_magic()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_neon.py b/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_neon.py
deleted file mode 100644
index 403d9f90..00000000
--- a/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_neon.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_neon import harvest_neon
-
-@job
-def implnet_job_neon():
- harvest_neon()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_neotomadb.py b/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_neotomadb.py
deleted file mode 100644
index 4197a852..00000000
--- a/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_neotomadb.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_neotomadb import harvest_neotomadb
-
-@job
-def implnet_job_neotomadb():
- harvest_neotomadb()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_opencoredata.py b/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_opencoredata.py
deleted file mode 100644
index a70c00a4..00000000
--- a/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_opencoredata.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_opencoredata import harvest_opencoredata
-
-@job
-def implnet_job_opencoredata():
- harvest_opencoredata()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_opentopography.py b/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_opentopography.py
deleted file mode 100644
index c59db610..00000000
--- a/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_opentopography.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_opentopography import harvest_opentopography
-
-@job
-def implnet_job_opentopography():
- harvest_opentopography()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_r2r.py b/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_r2r.py
deleted file mode 100644
index ed4bbaae..00000000
--- a/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_r2r.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_r2r import harvest_r2r
-
-@job
-def implnet_job_r2r():
- harvest_r2r()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_resource_registry.py b/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_resource_registry.py
deleted file mode 100644
index 582d1c9a..00000000
--- a/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_resource_registry.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_resource_registry import harvest_resource_registry
-
-@job
-def implnet_job_resource_registry():
- harvest_resource_registry()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_ssdbiodp.py b/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_ssdbiodp.py
deleted file mode 100644
index 0c2a7d32..00000000
--- a/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_ssdbiodp.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_ssdbiodp import harvest_ssdbiodp
-
-@job
-def implnet_job_ssdbiodp():
- harvest_ssdbiodp()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_ucar.py b/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_ucar.py
deleted file mode 100644
index 86289be2..00000000
--- a/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_ucar.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_ucar import harvest_ucar
-
-@job
-def implnet_job_ucar():
- harvest_ucar()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_unavco.py b/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_unavco.py
deleted file mode 100644
index 5b339869..00000000
--- a/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_unavco.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_unavco import harvest_unavco
-
-@job
-def implnet_job_unavco():
- harvest_unavco()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_unidata.py b/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_unidata.py
deleted file mode 100644
index 065cb671..00000000
--- a/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_unidata.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_unidata import harvest_unidata
-
-@job
-def implnet_job_unidata():
- harvest_unidata()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_usapdc.py b/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_usapdc.py
deleted file mode 100644
index f4f13a75..00000000
--- a/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_usapdc.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_usapdc import harvest_usapdc
-
-@job
-def implnet_job_usapdc():
- harvest_usapdc()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_wifire.py b/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_wifire.py
deleted file mode 100644
index 10b38a00..00000000
--- a/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_wifire.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_wifire import harvest_wifire
-
-@job
-def implnet_job_wifire():
- harvest_wifire()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_amgeo.py b/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_amgeo.py
deleted file mode 100644
index a0618d00..00000000
--- a/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_amgeo.py
+++ /dev/null
@@ -1,819 +0,0 @@
-import distutils
-import logging
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner, endpointUpdateNamespace
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-from ec.summarize import summaryDF2ttl, get_summary4graph, get_summary4repoSubset
-from ec.graph.manageGraph import ManageBlazegraph as mg
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# #
-# path to gleaner config in Dagster-daemon is "/scheduler/gleanerconfig.yaml" (config file mounted)
-# WHEN RUNNING dagster-dev, this needs to be a path to a local file
-##
-DAGSTER_GLEANER_CONFIG_PATH = os.environ.get('DAGSTER_GLEANER_CONFIG_PATH', "/scheduler/gleanerconfig.yaml")
-
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-
-# set for the earhtcube utiltiies
-MINIO_OPTIONS={"secure":GLEANER_MINIO_USE_SSL
-
- ,"access_key": GLEANER_MINIO_ACCESS_KEY
- ,"secret_key": GLEANER_MINIO_SECRET_KEY
- }
-
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-#GLEANERIO_SUMMARY_GRAPH_ENDPOINT = os.environ.get('GLEANERIO_SUMMARY_GRAPH_ENDPOINT')
-GLEANERIO_SUMMARY_GRAPH_NAMESPACE = os.environ.get('GLEANERIO_SUMMARY_GRAPH_NAMESPACE',f"{GLEANER_GRAPH_NAMESPACE}_summary" )
-
-SUMMARY_PATH = 'graphs/summary'
-RELEASE_PATH = 'graphs/latest'
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-def _graphSummaryEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANERIO_SUMMARY_GRAPH_NAMESPACE}/sparql"
- return url
-def _pythonMinioAddress(url, port = None):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- if port is not None:
- PYTHON_MINIO_URL = f"{PYTHON_MINIO_URL}:{port}"
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioAddress(GLEANER_MINIO_ADDRESS,GLEANER_MINIO_PORT )
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
-
- server = _pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT)
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def post_to_graph(source, path=RELEASE_PATH, extension="nq", graphendpoint=_graphEndpoint()):
- # revision of EC utilities, will have a insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = _pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT)
- bucket = GLEANER_MINIO_BUCKET
- release_url = f"{proto}://{address}/{bucket}/{path}/{source}_release.{extension}"
- # BLAZEGRAPH SPECIFIC
- # url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- # get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- # r = requests.post(url)
- # log.debug(f' status:{r.status_code}') # status:404
- # get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- # if r.status_code == 200:
- # # ''
- # if 'data modified="0"' in r.text:
- # get_dagster_logger().info(f'graph: no data inserted ')
- # raise Exception("No Data Added: " + r.text)
- # return True
- # else:
- # get_dagster_logger().info(f'graph: error')
- # raise Exception(f' graph: insert failed: status:{r.status_code}')
-
- ### GENERIC LOAD FROM
- url = f"{graphendpoint}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- loadfrom = {'update': f'LOAD <{release_url}>'}
- headers = {
- 'Content-Type': 'application/x-www-form-urlencoded'
- }
- r = requests.post(url, headers=headers, data=loadfrom )
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: LOAD from {release_url}: status:{r.status_code}')
- if r.status_code == 200:
- get_dagster_logger().info(f'graph load response: {str(r.text)} ')
- # ''
- if 'mutationCount=0' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- #raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error {str(r.text)}')
- raise Exception(f' graph: failed, LOAD from {release_url}: status:{r.status_code}')
-
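The generic load above is just a SPARQL 1.1 Update `LOAD` posted as a form. A hedged sketch of the same request, with placeholder endpoint and release URLs (Blazegraph signals an empty load with `mutationCount=0` in the response body):

```python
import requests

def sparql_load(endpoint: str, release_url: str, timeout: int = 300) -> bool:
    """POST 'LOAD <release_url>' to a SPARQL update endpoint; True if data landed."""
    resp = requests.post(
        endpoint,
        headers={"Content-Type": "application/x-www-form-urlencoded"},
        data={"update": f"LOAD <{release_url}>"},
        timeout=timeout,
    )
    resp.raise_for_status()
    return "mutationCount=0" not in resp.text

# sparql_load("http://graph.example.org/blazegraph/namespace/demo/sparql",
#             "https://minio.example.org/gleaner/graphs/latest/demo_release.nq")
```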
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
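What `_get_client` does, reduced to its essentials: docker-py keeps a requests session on `client.api`, so an extra header (here a Portainer-style API key) can be attached once and is then sent with every call. The URL and key below are placeholders.

```python
import docker

def docker_client_with_api_key(base_url: str, api_key: str) -> docker.DockerClient:
    client = docker.DockerClient(base_url=base_url, version="1.43")
    client.api.headers["X-API-Key"] = api_key  # applied to all subsequent API requests
    return client

# client = docker_client_with_api_key("tcp://portainer.example.org:9443", "changeme")
```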
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
- ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
- # docker.py: for a replicated job, total completions = replicas
- # with replicas=0 you do not get a container
- service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
- mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise Exception(f"Container for service {name} did not start")
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
-
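The polling loop at the end of `_create_service` can be isolated like this; a sketch with the 12-second timeout made explicit (`client` is a docker-py `DockerClient`, `name` the swarm service name):

```python
import time

def wait_for_service_container(client, name: str, timeout_s: int = 12):
    """Poll until a container for swarm service `name` appears, else time out."""
    for _ in range(timeout_s):
        containers = client.containers.list(
            all=True, filters={"label": f"com.docker.swarm.service.name={name}"}
        )
        if containers:
            return containers[0]
        time.sleep(1)
    raise TimeoutError(f"Container for service {name} did not start")
```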
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) # the python needs to be wrapped, this does not
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
- cid = container.id # legacy until the service start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-# when watching the logs this way,
- # do not let a possible issue with container logs stop log upload.
- ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
-
- get_dagster_logger().info(f"This is ok. watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"This is ok. watch container logs failed other issue:{repr(ex)} ")
-
-
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes-like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + 'containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
- # Future: need to extract files and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
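The if/elif chain that opens `gleanerio` could be table-driven; a hedged sketch of the nabu branches, assuming the config-path default used in this module:

```python
NABU_MODES = {
    "prune":   ["prune",   "--prefix", "summoned/{source}"],
    "prov":    ["prefix",  "--prefix", "prov/{source}"],
    "orgs":    ["prefix",  "--prefix", "orgs"],
    "release": ["release", "--prefix", "summoned/{source}"],
}

def build_nabu_args(mode: str, source: str, cfg_path: str = "/nabu/nabuconfig.yaml") -> list:
    """Assemble the nabu command line for a given mode and source."""
    return ["--cfg", cfg_path] + [part.format(source=source) for part in NABU_MODES[mode]]

assert build_nabu_args("prune", "amgeo") == [
    "--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/amgeo",
]
```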
-@op
-def amgeo_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def amgeo_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "amgeo")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def amgeo_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "amgeo")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def amgeo_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "amgeo")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def amgeo_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "amgeo")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def amgeo_naburelease(context):
- returned_value = gleanerio(context,("release"), "amgeo")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def amgeo_uploadrelease(context):
- returned_value = post_to_graph("amgeo", extension="nq")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def amgeo_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="amgeo")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "amgeo"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def amgeo_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="amgeo")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "amgeo"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def amgeo_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="amgeo")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "amgeo"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def amgeo_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="amgeo")
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "amgeo"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def amgeo_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "amgeo"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-class S3ObjectInfo:
- bucket_name=""
- object_name=""
-
-@op(ins={"start": In(Nothing)})
-def amgeo_summarize(context) :
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "amgeo"
- endpoint = _graphEndpoint() # getting data, not uploading data
- summary_namespace = _graphSummaryEndpoint()
-
-
- try:
-
- summarydf = get_summary4repoSubset(endpoint, source_name)
- nt, g = summaryDF2ttl(summarydf, source_name) # let's try the new generator
- summaryttl = g.serialize(format='longturtle')
- # Let's always write the file out to s3, and insert as a separate process
- # we might be able to make this an asset..., but would need to be accessible by http
- # if not stored in s3
- objectname = f"{SUMMARY_PATH}/{source_name}_release.ttl" # needs to match what post_to_graph expects
- s3ObjectInfo= S3ObjectInfo()
- s3ObjectInfo.bucket_name=bucket
- s3ObjectInfo.object_name=objectname
-
- s3Minio.putTextFileToStore(summaryttl, s3ObjectInfo )
- #inserted = sumnsgraph.insert(bytes(summaryttl, 'utf-8'), content_type="application/x-turtle")
- #if not inserted:
- # raise Exception("Loading to graph failed.")
- except Exception as e:
- # use dagster logger
- get_dagster_logger().error(f"Summary. Issue creating graph {str(e)} ")
- raise Exception(f"Loading Summary graph failed. {str(e)}")
- return 1
-
- return
-
-@op(ins={"start": In(Nothing)})
-def amgeo_upload_summarize(context):
- returned_value = post_to_graph("amgeo",path=SUMMARY_PATH, extension="ttl", graphendpoint=_graphSummaryEndpoint())
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload summary returned {r} ")
- return
-
-# Can we simplify and use just a method, then import these methods?
-# def missingreport_s3(context, msg: str, source="amgeo"):
-#
-# source= getSitemapSourcesFromGleaner("/scheduler/gleanerconfig.yaml", sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="amgeo"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_amgeo():
- containers = amgeo_getImage()
- harvest = amgeo_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = amgeo_missingreport_s3(start=harvest)
- report_idstat = amgeo_identifier_stats(start=report_ms3)
- # for some reason, this causes a missing msg parameter error
- report_bucketurl = amgeo_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="amgeo")
- load_release = amgeo_naburelease(start=harvest)
- load_uploadrelease = amgeo_uploadrelease(start=load_release)
-
- load_prune = amgeo_nabu_prune(start=load_uploadrelease)
- load_prov = amgeo_nabuprov(start=load_prune)
- load_org = amgeo_nabuorg(start=load_prov)
-
- summarize = amgeo_summarize(start=load_uploadrelease)
- upload_summarize = amgeo_upload_summarize(start=summarize)
-
-# run after load
- report_msgraph = amgeo_missingreport_graph(start=summarize)
- report_graph = amgeo_graph_reports(start=report_msgraph)
-
-
-
-
-
-
-
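The `harvest_amgeo` graph above sequences ops that exchange no data by wiring each op's `Nothing` input to its predecessor. A two-op sketch of that pattern (names are illustrative):

```python
from dagster import In, Nothing, get_dagster_logger, graph, op

@op
def pull_images():
    get_dagster_logger().info("images pulled")

@op(ins={"start": In(Nothing)})
def harvest():
    get_dagster_logger().info("harvest ran strictly after pull_images")

@graph
def mini_harvest():
    # the Nothing input carries no data, only ordering
    harvest(start=pull_images())

mini_harvest_job = mini_harvest.to_job()
```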
diff --git a/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_aquadocs.py b/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_aquadocs.py
deleted file mode 100644
index f4de1374..00000000
--- a/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_aquadocs.py
+++ /dev/null
@@ -1,819 +0,0 @@
-import distutils
-import logging
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner, endpointUpdateNamespace
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-from ec.summarize import summaryDF2ttl, get_summary4graph, get_summary4repoSubset
-from ec.graph.manageGraph import ManageBlazegraph as mg
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# #
-# path to gleaner config in Dagster-daemon is "/scheduler/gleanerconfig.yaml" (config file mounted)
-# WHEN RUNNING dagster-dev, this needs to be a path to a local file
-##
-DAGSTER_GLEANER_CONFIG_PATH = os.environ.get('DAGSTER_GLEANER_CONFIG_PATH', "/scheduler/gleanerconfig.yaml")
-
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-
- # set for the earthcube utilities
-MINIO_OPTIONS={"secure":GLEANER_MINIO_USE_SSL
-
- ,"access_key": GLEANER_MINIO_ACCESS_KEY
- ,"secret_key": GLEANER_MINIO_SECRET_KEY
- }
-
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-#GLEANERIO_SUMMARY_GRAPH_ENDPOINT = os.environ.get('GLEANERIO_SUMMARY_GRAPH_ENDPOINT')
-GLEANERIO_SUMMARY_GRAPH_NAMESPACE = os.environ.get('GLEANERIO_SUMMARY_GRAPH_NAMESPACE',f"{GLEANER_GRAPH_NAMESPACE}_summary" )
-
-SUMMARY_PATH = 'graphs/summary'
-RELEASE_PATH = 'graphs/latest'
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-def _graphSummaryEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANERIO_SUMMARY_GRAPH_NAMESPACE}/sparql"
- return url
-def _pythonMinioAddress(url, port = None):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- if port is not None:
- PYTHON_MINIO_URL = f"{PYTHON_MINIO_URL}:{port}"
- return PYTHON_MINIO_URL
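A couple of quick, standalone checks of the normalization `_pythonMinioAddress` performs (AWS-hosted endpoints collapse to `s3.amazonaws.com`, and a port is appended only when one is given); the function is restated here so the asserts run on their own:

```python
def python_minio_address(url: str, port=None) -> str:
    address = "s3.amazonaws.com" if url.endswith(".amazonaws.com") else url
    return f"{address}:{port}" if port is not None else address

assert python_minio_address("bucket.s3.us-east-1.amazonaws.com") == "s3.amazonaws.com"
assert python_minio_address("minio.example.org", 9000) == "minio.example.org:9000"
assert python_minio_address("minio.example.org") == "minio.example.org"
```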
-def read_file_bytestream(image_path):
- with open(image_path, 'rb') as f:
- return f.read()
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
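`load_data` leans on the fact that `urlopen` raises `ValueError` ("unknown url type") for a plain filesystem path, which is what triggers the file fallback; a small demonstration with a temporary file (assuming a POSIX-style path):

```python
import tempfile
import urllib.request

with tempfile.NamedTemporaryFile(delete=False) as tmp:
    tmp.write(b"local bytes")

try:
    urllib.request.urlopen(tmp.name)  # bare path has no scheme -> ValueError
except ValueError:
    with open(tmp.name, "rb") as f:
        print(f.read())  # b'local bytes'
```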
-def s3reader(object):
- server = _pythonMinioAddress(GLEANER_MINIO_ADDRESS,GLEANER_MINIO_PORT )
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
-
- server = _pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT)
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def post_to_graph(source, path=RELEASE_PATH, extension="nq", graphendpoint=_graphEndpoint()):
- # revision of EC utilities, will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = _pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT)
- bucket = GLEANER_MINIO_BUCKET
- release_url = f"{proto}://{address}/{bucket}/{path}/{source}_release.{extension}"
- # BLAZEGRAPH SPECIFIC
- # url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- # get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- # r = requests.post(url)
- # log.debug(f' status:{r.status_code}') # status:404
- # get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- # if r.status_code == 200:
- # # ''
- # if 'data modified="0"' in r.text:
- # get_dagster_logger().info(f'graph: no data inserted ')
- # raise Exception("No Data Added: " + r.text)
- # return True
- # else:
- # get_dagster_logger().info(f'graph: error')
- # raise Exception(f' graph: insert failed: status:{r.status_code}')
-
- ### GENERIC LOAD FROM
- url = f"{graphendpoint}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- loadfrom = {'update': f'LOAD <{release_url}>'}
- headers = {
- 'Content-Type': 'application/x-www-form-urlencoded'
- }
- r = requests.post(url, headers=headers, data=loadfrom )
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: LOAD from {release_url}: status:{r.status_code}')
- if r.status_code == 200:
- get_dagster_logger().info(f'graph load response: {str(r.text)} ')
- # ''
- if 'mutationCount=0' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- #raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error {str(r.text)}')
- raise Exception(f' graph: failed, LOAD from {release_url}: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
- ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
- # docker.py: for a replicated job, total completions = replicas
- # with replicas=0 you do not get a container
- service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
- mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise Exception(f"Container for service {name} did not start")
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) # the python needs to be wrapped, this does not
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
- cid = container.id # legacy until the service start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-# when watching the logs this way,
- # do not let a possible issue with container logs stop log upload.
- ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
-
- get_dagster_logger().info(f"This is ok. watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"This is ok. watch container logs failed other issue:{repr(ex)} ")
-
-
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes-like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + 'containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
- # Future: need to extract files and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
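The log-archive retrieval inside `gleanerio` is a raw Docker Engine API call: `GET /containers/<id>/archive?path=...` returns a tar stream of the requested path. A hedged sketch with placeholder endpoint, key, and container id:

```python
import io
import tarfile
import urllib.parse
import urllib.request

def fetch_container_archive(base_url: str, api_key: str, cid: str, path: str) -> tarfile.TarFile:
    """Fetch `path` from container `cid` as an in-memory tar archive."""
    query = urllib.parse.urlencode({"path": path})
    req = urllib.request.Request(f"{base_url}containers/{cid}/archive?{query}", method="GET")
    req.add_header("X-API-Key", api_key)
    with urllib.request.urlopen(req) as resp:
        return tarfile.open(fileobj=io.BytesIO(resp.read()))

# tar = fetch_container_archive("https://portainer.example.org/api/endpoints/1/docker/",
#                               "changeme", "abc123", "/gleaner/logs")
# tar.extractall("extract_to/")
```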
-@op
-def aquadocs_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def aquadocs_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "aquadocs")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def aquadocs_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "aquadocs")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def aquadocs_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "aquadocs")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def aquadocs_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "aquadocs")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def aquadocs_naburelease(context):
- returned_value = gleanerio(context,("release"), "aquadocs")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def aquadocs_uploadrelease(context):
- returned_value = post_to_graph("aquadocs", extension="nq")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def aquadocs_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="aquadocs")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "aquadocs"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def aquadocs_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="aquadocs")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "aquadocs"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def aquadocs_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="aquadocs")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "aquadocs"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def aquadocs_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="aquadocs")
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "aquadocs"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def aquadocs_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "aquadocs"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-class S3ObjectInfo:
- bucket_name=""
- object_name=""
-
-@op(ins={"start": In(Nothing)})
-def aquadocs_summarize(context) :
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "aquadocs"
- endpoint = _graphEndpoint() # getting data, not uploading data
- summary_namespace = _graphSummaryEndpoint()
-
-
- try:
-
- summarydf = get_summary4repoSubset(endpoint, source_name)
- nt, g = summaryDF2ttl(summarydf, source_name) # let's try the new generator
- summaryttl = g.serialize(format='longturtle')
- # Let's always write the file out to s3, and insert as a separate process
- # we might be able to make this an asset..., but would need to be accessible by http
- # if not stored in s3
- objectname = f"{SUMMARY_PATH}/{source_name}_release.ttl" # needs to match what post_to_graph expects
- s3ObjectInfo= S3ObjectInfo()
- s3ObjectInfo.bucket_name=bucket
- s3ObjectInfo.object_name=objectname
-
- s3Minio.putTextFileToStore(summaryttl, s3ObjectInfo )
- #inserted = sumnsgraph.insert(bytes(summaryttl, 'utf-8'), content_type="application/x-turtle")
- #if not inserted:
- # raise Exception("Loading to graph failed.")
- except Exception as e:
- # use dagster logger
- get_dagster_logger().error(f"Summary. Issue creating graph {str(e)} ")
- raise Exception(f"Loading Summary graph failed. {str(e)}")
- return 1
-
- return
-
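The summarize op's final step is an rdflib serialization to `longturtle`; a self-contained sketch with an illustrative triple (rdflib 6+ registers the `longturtle` format):

```python
from rdflib import Graph, Literal, URIRef
from rdflib.namespace import RDFS

g = Graph()
g.add((URIRef("https://example.org/dataset/1"), RDFS.label, Literal("demo dataset")))
summaryttl = g.serialize(format="longturtle")
print(summaryttl)
```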
-@op(ins={"start": In(Nothing)})
-def aquadocs_upload_summarize(context):
- returned_value = post_to_graph("aquadocs",path=SUMMARY_PATH, extension="ttl", graphendpoint=_graphSummaryEndpoint())
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload summary returned {r} ")
- return
-
-# Can we simplify and use just a method, then import these methods?
-# def missingreport_s3(context, msg: str, source="aquadocs"):
-#
-# source= getSitemapSourcesFromGleaner("/scheduler/gleanerconfig.yaml", sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="aquadocs"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_aquadocs():
- containers = aquadocs_getImage()
- harvest = aquadocs_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = aquadocs_missingreport_s3(start=harvest)
- report_idstat = aquadocs_identifier_stats(start=report_ms3)
- # for some reason, this causes a missing msg parameter error
- report_bucketurl = aquadocs_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="aquadocs")
- load_release = aquadocs_naburelease(start=harvest)
- load_uploadrelease = aquadocs_uploadrelease(start=load_release)
-
- load_prune = aquadocs_nabu_prune(start=load_uploadrelease)
- load_prov = aquadocs_nabuprov(start=load_prune)
- load_org = aquadocs_nabuorg(start=load_prov)
-
- summarize = aquadocs_summarize(start=load_uploadrelease)
- upload_summarize = aquadocs_upload_summarize(start=summarize)
-
-# run after load
- report_msgraph = aquadocs_missingreport_graph(start=summarize)
- report_graph = aquadocs_graph_reports(start=report_msgraph)
-
-
-
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_bcodmo.py b/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_bcodmo.py
deleted file mode 100644
index 92e2d858..00000000
--- a/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_bcodmo.py
+++ /dev/null
@@ -1,819 +0,0 @@
-import distutils
-import logging
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner, endpointUpdateNamespace
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-from ec.summarize import summaryDF2ttl, get_summary4graph, get_summary4repoSubset
-from ec.graph.manageGraph import ManageBlazegraph as mg
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# #
-# path to gleaner config in Dagster-daemon is "/scheduler/gleanerconfig.yaml" (config file mounted)
-# WHEN RUNNING dagster-dev, this needs to be a path to a local file
-##
-DAGSTER_GLEANER_CONFIG_PATH = os.environ.get('DAGSTER_GLEANER_CONFIG_PATH', "/scheduler/gleanerconfig.yaml")
-
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-
-# set for the earthcube utilities
-MINIO_OPTIONS={"secure":GLEANER_MINIO_USE_SSL
-
- ,"access_key": GLEANER_MINIO_ACCESS_KEY
- ,"secret_key": GLEANER_MINIO_SECRET_KEY
- }
-
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-#GLEANERIO_SUMMARY_GRAPH_ENDPOINT = os.environ.get('GLEANERIO_SUMMARY_GRAPH_ENDPOINT')
-GLEANERIO_SUMMARY_GRAPH_NAMESPACE = os.environ.get('GLEANERIO_SUMMARY_GRAPH_NAMESPACE',f"{GLEANER_GRAPH_NAMESPACE}_summary" )
-
-SUMMARY_PATH = 'graphs/summary'
-RELEASE_PATH = 'graphs/latest'
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-def _graphSummaryEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANERIO_SUMMARY_GRAPH_NAMESPACE}/sparql"
- return url
-def _pythonMinioAddress(url, port = None):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- if port is not None:
- PYTHON_MINIO_URL = f"{PYTHON_MINIO_URL}:{port}"
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioAddress(GLEANER_MINIO_ADDRESS,GLEANER_MINIO_PORT )
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
-
- server = _pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT)
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def post_to_graph(source, path=RELEASE_PATH, extension="nq", graphendpoint=_graphEndpoint()):
-    # a revision of EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = _pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT)
- bucket = GLEANER_MINIO_BUCKET
- release_url = f"{proto}://{address}/{bucket}/{path}/{source}_release.{extension}"
- # BLAZEGRAPH SPECIFIC
- # url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- # get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- # r = requests.post(url)
- # log.debug(f' status:{r.status_code}') # status:404
- # get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- # if r.status_code == 200:
- # # ''
- # if 'data modified="0"' in r.text:
- # get_dagster_logger().info(f'graph: no data inserted ')
- # raise Exception("No Data Added: " + r.text)
- # return True
- # else:
- # get_dagster_logger().info(f'graph: error')
- # raise Exception(f' graph: insert failed: status:{r.status_code}')
-
- ### GENERIC LOAD FROM
- url = f"{graphendpoint}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- loadfrom = {'update': f'LOAD <{release_url}>'}
- headers = {
- 'Content-Type': 'application/x-www-form-urlencoded'
- }
- r = requests.post(url, headers=headers, data=loadfrom )
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: LOAD from {release_url}: status:{r.status_code}')
- if r.status_code == 200:
- get_dagster_logger().info(f'graph load response: {str(r.text)} ')
- # ''
- if 'mutationCount=0' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- #raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error {str(r.text)}')
- raise Exception(f' graph: failed, LOAD from {release_url}: status:{r.status_code}')
-
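post_to_graph() above performs a generic SPARQL 1.1 LOAD by POSTing a form-encoded update to the namespace endpoint. A minimal standalone sketch of the same call; the endpoint and release URLs are placeholders:

```python
# Sketch of the LOAD-from-URL request issued by post_to_graph(); the URLs
# used in the comment below are placeholders, not real services.
import requests

def load_release(graph_endpoint: str, release_url: str) -> bool:
    r = requests.post(
        graph_endpoint,
        headers={"Content-Type": "application/x-www-form-urlencoded"},
        data={"update": f"LOAD <{release_url}>"},
    )
    r.raise_for_status()
    # Blazegraph reports how much work was done via mutationCount in the body
    return "mutationCount=0" not in r.text

# load_release("http://localhost:9999/blazegraph/namespace/kb/sparql",
#              "https://example.org/bucket/graphs/latest/demo_release.nq")
```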
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
-    ## thoughts
-    # return service and container, since there is only one
- restart_policy = RestartPolicy(condition='none')
-    # in docker-py, for a replicated job, total completions = replicas;
-    # replicas=0 means you do not get a container
-    service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
-        mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise f"Container for service {name} not starting"
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
-
-
-
-
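_create_service() above leans on Swarm's replicated-job mode to run a container to completion exactly once, then polls until the task's container materializes. A trimmed sketch of that pattern, omitting the config and network plumbing; it assumes a reachable Docker daemon with Swarm enabled:

```python
# Sketch of the one-shot Swarm service pattern from _create_service().
import time
import docker
from docker.types import RestartPolicy, ServiceMode

def run_once(client: docker.DockerClient, image: str, command, name: str):
    service = client.services.create(
        image,
        args=command,
        name=name,
        restart_policy=RestartPolicy(condition="none"),
        # replicated-job: total completions = replicas; replicas=0 yields no container
        mode=ServiceMode("replicated-job", concurrency=1, replicas=1),
    )
    # the task's container appears asynchronously, so poll for it
    for _ in range(12):
        containers = client.containers.list(
            all=True, filters={"label": f"com.docker.swarm.service.name={name}"}
        )
        if containers:
            return service, containers[0]
        time.sleep(1)
    service.remove()
    raise RuntimeError(f"container for service {name} never started")
```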
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) # the python needs to be wrapped, this does not
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
-        cid = container.id # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-        # when watching the logs this way, do not let a possible issue with
-        # container logs stop the log upload.
-        ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
-
- get_dagster_logger().info(f"This is ok. watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"This is ok. watch container logs failed other issue:{repr(ex)} ")
-
-
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + 'containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
-        # Future: need to extract files and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
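On the "Future" note inside gleanerio() about extracting the retrieved log archive: the Docker /containers/{id}/archive endpoint returns a tar stream, so the files could be unpacked in memory before upload. A sketch under that assumption; extract_logs is hypothetical, not part of the generated code:

```python
# Hypothetical helper for the archive bytes returned by /containers/{id}/archive;
# unpacks the tar stream in memory and maps member names to file contents.
import io
import tarfile

def extract_logs(archive_bytes: bytes) -> dict:
    logs = {}
    with tarfile.open(fileobj=io.BytesIO(archive_bytes)) as tar:
        for member in tar.getmembers():
            if member.isfile():
                extracted = tar.extractfile(member)
                if extracted is not None:
                    logs[member.name] = extracted.read()
    return logs
```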
-@op
-def bcodmo_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def bcodmo_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "bcodmo")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def bcodmo_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "bcodmo")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def bcodmo_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "bcodmo")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def bcodmo_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "bcodmo")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def bcodmo_naburelease(context):
- returned_value = gleanerio(context,("release"), "bcodmo")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def bcodmo_uploadrelease(context):
- returned_value = post_to_graph("bcodmo", extension="nq")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def bcodmo_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="bcodmo")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "bcodmo"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-    r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def bcodmo_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="bcodmo")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "bcodmo"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def bcodmo_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="bcodmo")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "bcodmo"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def bcodmo_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="bcodmo")
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "bcodmo"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def bcodmo_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "bcodmo"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-class S3ObjectInfo:
- bucket_name=""
- object_name=""
-
-@op(ins={"start": In(Nothing)})
-def bcodmo_summarize(context) :
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "bcodmo"
- endpoint = _graphEndpoint() # getting data, not uploading data
- summary_namespace = _graphSummaryEndpoint()
-
-
- try:
-
- summarydf = get_summary4repoSubset(endpoint, source_name)
- nt, g = summaryDF2ttl(summarydf, source_name) # let's try the new generator
- summaryttl = g.serialize(format='longturtle')
-        # Let's always write the file out to s3, and insert as a separate process.
-        # we might be able to make this an asset..., but it would need to be
-        # accessible by http if not stored in s3
-        objectname = f"{SUMMARY_PATH}/{source_name}_release.ttl" # needs to match what post_to_graph expects
- s3ObjectInfo= S3ObjectInfo()
- s3ObjectInfo.bucket_name=bucket
- s3ObjectInfo.object_name=objectname
-
- s3Minio.putTextFileToStore(summaryttl, s3ObjectInfo )
- #inserted = sumnsgraph.insert(bytes(summaryttl, 'utf-8'), content_type="application/x-turtle")
- #if not inserted:
- # raise Exception("Loading to graph failed.")
- except Exception as e:
- # use dagster logger
- get_dagster_logger().error(f"Summary. Issue creating graph {str(e)} ")
- raise Exception(f"Loading Summary graph failed. {str(e)}")
- return 1
-
- return
-
-@op(ins={"start": In(Nothing)})
-def bcodmo_upload_summarize(context):
- returned_value = post_to_graph("bcodmo",path=SUMMARY_PATH, extension="ttl", graphendpoint=_graphSummaryEndpoint())
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload summary returned {r} ")
- return
-
-# Can we simplify this to plain methods and import them? (a sketch follows)
-# def missingreport_s3(context, msg: str, source="bcodmo"):
-#
-# source= getSitemapSourcesFromGleaner("/scheduler/gleanerconfig.yaml", sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="bcodmo"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
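On the "can we simplify" question above: one answer is an op factory parameterized by source name, so each per-source op is generated rather than copied. A sketch using real Dagster APIs; make_missingreport_s3_op and its body are illustrative:

```python
# Sketch of collapsing the duplicated per-source ops into a factory;
# the factory name and body are illustrative, not part of the generated code.
from dagster import In, Nothing, get_dagster_logger, op

def make_missingreport_s3_op(source_name: str):
    @op(name=f"{source_name}_missingreport_s3", ins={"start": In(Nothing)})
    def _missingreport_s3(context):
        # same body as the generated op, with source_name closed over
        get_dagster_logger().info(f"missing report for {source_name}")
    return _missingreport_s3

bcodmo_missingreport_s3 = make_missingreport_s3_op("bcodmo")
cchdo_missingreport_s3 = make_missingreport_s3_op("cchdo")
```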
-@graph
-def harvest_bcodmo():
- containers = bcodmo_getImage()
- harvest = bcodmo_gleaner(start=containers)
-
-# defining nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = bcodmo_missingreport_s3(start=harvest)
- report_idstat = bcodmo_identifier_stats(start=report_ms3)
-    # for some reason, chaining this directly caused a missing msg parameter error
- report_bucketurl = bcodmo_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="bcodmo")
- load_release = bcodmo_naburelease(start=harvest)
- load_uploadrelease = bcodmo_uploadrelease(start=load_release)
-
- load_prune = bcodmo_nabu_prune(start=load_uploadrelease)
- load_prov = bcodmo_nabuprov(start=load_prune)
- load_org = bcodmo_nabuorg(start=load_prov)
-
- summarize = bcodmo_summarize(start=load_uploadrelease)
- upload_summarize = bcodmo_upload_summarize(start=summarize)
-
-# run after load
- report_msgraph = bcodmo_missingreport_graph(start=summarize)
- report_graph = bcodmo_graph_reports(start=report_msgraph)
-
-
-
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_cchdo.py b/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_cchdo.py
deleted file mode 100644
index b2a86804..00000000
--- a/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_cchdo.py
+++ /dev/null
@@ -1,819 +0,0 @@
-import distutils
-import logging
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner, endpointUpdateNamespace
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-from ec.summarize import summaryDF2ttl, get_summary4graph, get_summary4repoSubset
-from ec.graph.manageGraph import ManageBlazegraph as mg
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# #
-# path to gleaner config in Dagster-daemon is "/scheduler/gleanerconfig.yaml" (config file mounted)
-# WHEN RUNNING dagster-dev, this needs to be a path to a local file
-##
-DAGSTER_GLEANER_CONFIG_PATH = os.environ.get('DAGSTER_GLEANER_CONFIG_PATH', "/scheduler/gleanerconfig.yaml")
-
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-
-# set for the earthcube utilities
-MINIO_OPTIONS={"secure":GLEANER_MINIO_USE_SSL
-
- ,"access_key": GLEANER_MINIO_ACCESS_KEY
- ,"secret_key": GLEANER_MINIO_SECRET_KEY
- }
-
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-#GLEANERIO_SUMMARY_GRAPH_ENDPOINT = os.environ.get('GLEANERIO_SUMMARY_GRAPH_ENDPOINT')
-GLEANERIO_SUMMARY_GRAPH_NAMESPACE = os.environ.get('GLEANERIO_SUMMARY_GRAPH_NAMESPACE',f"{GLEANER_GRAPH_NAMESPACE}_summary" )
-
-SUMMARY_PATH = 'graphs/summary'
-RELEASE_PATH = 'graphs/latest'
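One caveat on the env parsing at the top of each of these modules: distutils was deprecated by PEP 632 and is removed in Python 3.12, so the strtobool call may need a stand-in. A minimal replacement, assuming a simple env_flag helper:

```python
# env_flag is an assumed stand-in for distutils.util.strtobool, not part of
# the generated code; unset variables fall back to the given default.
import os

def env_flag(name: str, default: str = "false") -> bool:
    return os.environ.get(name, default).strip().lower() in ("1", "t", "true", "y", "yes", "on")

# e.g. GLEANER_MINIO_USE_SSL = env_flag('GLEANERIO_MINIO_USE_SSL')
```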
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-def _graphSummaryEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANERIO_SUMMARY_GRAPH_NAMESPACE}/sparql"
- return url
-def _pythonMinioAddress(url, port = None):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- if port is not None:
- PYTHON_MINIO_URL = f"{PYTHON_MINIO_URL}:{port}"
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioAddress(GLEANER_MINIO_ADDRESS,GLEANER_MINIO_PORT )
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
-
- server = _pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT)
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def post_to_graph(source, path=RELEASE_PATH, extension="nq", graphendpoint=_graphEndpoint()):
-    # a revision of EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = _pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT)
- bucket = GLEANER_MINIO_BUCKET
- release_url = f"{proto}://{address}/{bucket}/{path}/{source}_release.{extension}"
- # BLAZEGRAPH SPECIFIC
- # url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- # get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- # r = requests.post(url)
- # log.debug(f' status:{r.status_code}') # status:404
- # get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- # if r.status_code == 200:
- # # ''
- # if 'data modified="0"' in r.text:
- # get_dagster_logger().info(f'graph: no data inserted ')
- # raise Exception("No Data Added: " + r.text)
- # return True
- # else:
- # get_dagster_logger().info(f'graph: error')
- # raise Exception(f' graph: insert failed: status:{r.status_code}')
-
- ### GENERIC LOAD FROM
- url = f"{graphendpoint}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- loadfrom = {'update': f'LOAD <{release_url}>'}
- headers = {
- 'Content-Type': 'application/x-www-form-urlencoded'
- }
- r = requests.post(url, headers=headers, data=loadfrom )
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: LOAD from {release_url}: status:{r.status_code}')
- if r.status_code == 200:
- get_dagster_logger().info(f'graph load response: {str(r.text)} ')
- # ''
- if 'mutationCount=0' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- #raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error {str(r.text)}')
- raise Exception(f' graph: failed, LOAD from {release_url}: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
-    ## thoughts
-    # return service and container, since there is only one
- restart_policy = RestartPolicy(condition='none')
-    # in docker-py, for a replicated job, total completions = replicas;
-    # replicas=0 means you do not get a container
-    service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
-        mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise f"Container for service {name} not starting"
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) # the python needs to be wrapped, this does not
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
-        cid = container.id # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-        # when watching the logs this way, do not let a possible issue with
-        # container logs stop the log upload.
-        ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
-
- get_dagster_logger().info(f"This is ok. watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"This is ok. watch container logs failed other issue:{repr(ex)} ")
-
-
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + 'containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
-        # Future: need to extract files and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def cchdo_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def cchdo_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "cchdo")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cchdo_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "cchdo")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cchdo_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "cchdo")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cchdo_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "cchdo")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cchdo_naburelease(context):
- returned_value = gleanerio(context,("release"), "cchdo")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cchdo_uploadrelease(context):
- returned_value = post_to_graph("cchdo", extension="nq")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def cchdo_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="cchdo")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cchdo"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-    r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cchdo_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="cchdo")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cchdo"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cchdo_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="cchdo")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cchdo"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cchdo_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="cchdo")
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cchdo"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cchdo_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cchdo"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-class S3ObjectInfo:
- bucket_name=""
- object_name=""
-
-@op(ins={"start": In(Nothing)})
-def cchdo_summarize(context) :
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cchdo"
- endpoint = _graphEndpoint() # getting data, not uploading data
- summary_namespace = _graphSummaryEndpoint()
-
-
- try:
-
- summarydf = get_summary4repoSubset(endpoint, source_name)
- nt, g = summaryDF2ttl(summarydf, source_name) # let's try the new generator
- summaryttl = g.serialize(format='longturtle')
- # Let's always write the file out to s3, and insert as a separate process
- # we might be able to make this an asset..., but it would need to be accessible by http
- # if not stored in s3
- objectname = f"{SUMMARY_PATH}/{source_name}_release.ttl" # needs to match what is expected by post
- s3ObjectInfo= S3ObjectInfo()
- s3ObjectInfo.bucket_name=bucket
- s3ObjectInfo.object_name=objectname
-
- s3Minio.putTextFileToStore(summaryttl, s3ObjectInfo )
- #inserted = sumnsgraph.insert(bytes(summaryttl, 'utf-8'), content_type="application/x-turtle")
- #if not inserted:
- # raise Exception("Loading to graph failed.")
- except Exception as e:
- # use dagster logger
- get_dagster_logger().error(f"Summary. Issue creating graph {str(e)} ")
- raise Exception(f"Loading Summary graph failed. {str(e)}")
- return 1
-
- return
-
-@op(ins={"start": In(Nothing)})
-def cchdo_upload_summarize(context):
- returned_value = post_to_graph("cchdo",path=SUMMARY_PATH, extension="ttl", graphendpoint=_graphSummaryEndpoint())
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload summary returned {r} ")
- return
-
-# Can we simplify this to just a method, and then import these methods?
-# def missingreport_s3(context, msg: str, source="cchdo"):
-#
-# source= getSitemapSourcesFromGleaner("/scheduler/gleanerconfig.yaml", sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="cchdo"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_cchdo():
- containers = cchdo_getImage()
- harvest = cchdo_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = cchdo_missingreport_s3(start=harvest)
- report_idstat = cchdo_identifier_stats(start=report_ms3)
- # for some reason, this causes a "msg parameter missing" error
- report_bucketurl = cchdo_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="cchdo")
- load_release = cchdo_naburelease(start=harvest)
- load_uploadrelease = cchdo_uploadrelease(start=load_release)
-
- load_prune = cchdo_nabu_prune(start=load_uploadrelease)
- load_prov = cchdo_nabuprov(start=load_prune)
- load_org = cchdo_nabuorg(start=load_prov)
-
- summarize = cchdo_summarize(start=load_uploadrelease)
- upload_summarize = cchdo_upload_summarize(start=summarize)
-
-# run after load
- report_msgraph = cchdo_missingreport_graph(start=summarize)
- report_graph = cchdo_graph_reports(start=report_msgraph)
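-
-# A hedged sketch, not part of the generated file: one way a graph like this
-# could be wired to a runnable job and a cron schedule. The job name and the
-# weekly cron string are illustrative assumptions.
-from dagster import ScheduleDefinition
-
-harvest_cchdo_job = harvest_cchdo.to_job(name="harvest_cchdo_job")
-harvest_cchdo_schedule = ScheduleDefinition(
- job=harvest_cchdo_job, cron_schedule="0 3 * * 0"
-)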
-
-
-
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_datadiscoverystudio.py b/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_datadiscoverystudio.py
deleted file mode 100644
index 28d0129f..00000000
--- a/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_datadiscoverystudio.py
+++ /dev/null
@@ -1,819 +0,0 @@
-import distutils
-import logging
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner, endpointUpdateNamespace
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-from ec.summarize import summaryDF2ttl, get_summary4graph, get_summary4repoSubset
-from ec.graph.manageGraph import ManageBlazegraph as mg
-import requests
-import logging as log
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# #
-# path to gleaner config in Dagster-daemon is "/scheduler/gleanerconfig.yaml" (config file mounted)
-# WHEN RUNNING dagster-dev, this needs to be a path to a local file
-##
-DAGSTER_GLEANER_CONFIG_PATH = os.environ.get('DAGSTER_GLEANER_CONFIG_PATH', "/scheduler/gleanerconfig.yaml")
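-# e.g. when running dagster dev locally, point this at a checked-out config
-# (the path below is an illustrative assumption, not a repo convention):
-#   export DAGSTER_GLEANER_CONFIG_PATH=./configs/gleanerconfig.yaml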
-
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-
-# set for the earthcube utilities
-MINIO_OPTIONS={"secure":GLEANER_MINIO_USE_SSL
-
- ,"access_key": GLEANER_MINIO_ACCESS_KEY
- ,"secret_key": GLEANER_MINIO_SECRET_KEY
- }
-
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-#GLEANERIO_SUMMARY_GRAPH_ENDPOINT = os.environ.get('GLEANERIO_SUMMARY_GRAPH_ENDPOINT')
-GLEANERIO_SUMMARY_GRAPH_NAMESPACE = os.environ.get('GLEANERIO_SUMMARY_GRAPH_NAMESPACE',f"{GLEANER_GRAPH_NAMESPACE}_summary" )
-
-SUMMARY_PATH = 'graphs/summary'
-RELEASE_PATH = 'graphs/latest'
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-def _graphSummaryEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANERIO_SUMMARY_GRAPH_NAMESPACE}/sparql"
- return url
-def _pythonMinioAddress(url, port = None):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- if port is not None:
- PYTHON_MINIO_URL = f"{PYTHON_MINIO_URL}:{port}"
- return PYTHON_MINIO_URL
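-# e.g. _pythonMinioAddress("minio.example.org", 9000) -> "minio.example.org:9000";
-# any host ending in ".amazonaws.com" collapses to "s3.amazonaws.com"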
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioAddress(GLEANER_MINIO_ADDRESS,GLEANER_MINIO_PORT )
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
-
- server = _pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT)
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def post_to_graph(source, path=RELEASE_PATH, extension="nq", graphendpoint=_graphEndpoint()):
- # a revision of EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = _pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT)
- bucket = GLEANER_MINIO_BUCKET
- release_url = f"{proto}://{address}/{bucket}/{path}/{source}_release.{extension}"
- # BLAZEGRAPH SPECIFIC
- # url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- # get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- # r = requests.post(url)
- # log.debug(f' status:{r.status_code}') # status:404
- # get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- # if r.status_code == 200:
- # # ''
- # if 'data modified="0"' in r.text:
- # get_dagster_logger().info(f'graph: no data inserted ')
- # raise Exception("No Data Added: " + r.text)
- # return True
- # else:
- # get_dagster_logger().info(f'graph: error')
- # raise Exception(f' graph: insert failed: status:{r.status_code}')
-
- ### GENERIC LOAD FROM
- url = f"{graphendpoint}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- loadfrom = {'update': f'LOAD <{release_url}>'}
- headers = {
- 'Content-Type': 'application/x-www-form-urlencoded'
- }
- r = requests.post(url, headers=headers, data=loadfrom )
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: LOAD from {release_url}: status:{r.status_code}')
- if r.status_code == 200:
- get_dagster_logger().info(f'graph load response: {str(r.text)} ')
- # ''
- if 'mutationCount=0' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- #raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error {str(r.text)}')
- raise Exception(f' graph: failed, LOAD from {release_url}: status:{r.status_code}')
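-# A hedged sketch of the SPARQL 1.1 UPDATE request that post_to_graph builds;
-# the endpoint and release URL below are illustrative assumptions:
-#
-# requests.post(
-#     "https://graph.example.org/blazegraph/namespace/eco/sparql",
-#     headers={"Content-Type": "application/x-www-form-urlencoded"},
-#     data={"update": "LOAD <https://minio.example.org/gleaner/graphs/latest/x_release.nq>"},
-# )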
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
- ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
- # docker.py: for a replicated job, total completions = replicas
- # replicas=0: you do not get a container
- service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
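- # i.e. a one-shot task: with restart condition 'none', a replicated-job
- # service with replicas=1 runs the container exactly once to completion,
- # swarm's closest analog to a plain `docker run`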
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
- mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise f"Container for service {name} not starting"
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) # the python needs to be wrapped, this does not
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
- cid = container.id # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-# when watching the logs this way,
- # do not let a possible issue with container logs stop the log upload.
- ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
-
- get_dagster_logger().info(f"This is ok. watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"This is ok. watch container logs failed other issue:{repr(ex)} ")
-
-
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + 'containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
- # Future: need to extract files, and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def datadiscoverystudio_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def datadiscoverystudio_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "datadiscoverystudio")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def datadiscoverystudio_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "datadiscoverystudio")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def datadiscoverystudio_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "datadiscoverystudio")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def datadiscoverystudio_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "datadiscoverystudio")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def datadiscoverystudio_naburelease(context):
- returned_value = gleanerio(context,("release"), "datadiscoverystudio")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def datadiscoverystudio_uploadrelease(context):
- returned_value = post_to_graph("datadiscoverystudio", extension="nq")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def datadiscoverystudio_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="datadiscoverystudio")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "datadiscoverystudio"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def datadiscoverystudio_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="datadiscoverystudio")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "datadiscoverystudio"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def datadiscoverystudio_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="datadiscoverystudio")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "datadiscoverystudio"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def datadiscoverystudio_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="datadiscoverystudio")
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "datadiscoverystudio"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def datadiscoverystudio_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "datadiscoverystudio"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-class S3ObjectInfo:
- bucket_name=""
- object_name=""
-
-@op(ins={"start": In(Nothing)})
-def datadiscoverystudio_summarize(context) :
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "datadiscoverystudio"
- endpoint = _graphEndpoint() # getting data, not uploading data
- summary_namespace = _graphSummaryEndpoint()
-
-
- try:
-
- summarydf = get_summary4repoSubset(endpoint, source_name)
- nt, g = summaryDF2ttl(summarydf, source_name) # let's try the new generator
- summaryttl = g.serialize(format='longturtle')
- # Let's always write the file out to s3, and insert as a separate process
- # we might be able to make this an asset..., but it would need to be accessible by http
- # if not stored in s3
- objectname = f"{SUMMARY_PATH}/{source_name}_release.ttl" # needs to match what is expected by post
- s3ObjectInfo= S3ObjectInfo()
- s3ObjectInfo.bucket_name=bucket
- s3ObjectInfo.object_name=objectname
-
- s3Minio.putTextFileToStore(summaryttl, s3ObjectInfo )
- #inserted = sumnsgraph.insert(bytes(summaryttl, 'utf-8'), content_type="application/x-turtle")
- #if not inserted:
- # raise Exception("Loading to graph failed.")
- except Exception as e:
- # use dagster logger
- get_dagster_logger().error(f"Summary. Issue creating graph {str(e)} ")
- raise Exception(f"Loading Summary graph failed. {str(e)}")
- return 1
-
- return
-
-@op(ins={"start": In(Nothing)})
-def datadiscoverystudio_upload_summarize(context):
- returned_value = post_to_graph("datadiscoverystudio",path=SUMMARY_PATH, extension="ttl", graphendpoint=_graphSummaryEndpoint())
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload summary returned {r} ")
- return
-
-# Can we simplify this to just a method, and then import these methods?
-# def missingreport_s3(context, msg: str, source="datadiscoverystudio"):
-#
-# source= getSitemapSourcesFromGleaner("/scheduler/gleanerconfig.yaml", sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="datadiscoverystudio"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_datadiscoverystudio():
- containers = datadiscoverystudio_getImage()
- harvest = datadiscoverystudio_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = datadiscoverystudio_missingreport_s3(start=harvest)
- report_idstat = datadiscoverystudio_identifier_stats(start=report_ms3)
- # for some reason, this causes a "msg parameter missing" error
- report_bucketurl = datadiscoverystudio_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="datadiscoverystudio")
- load_release = datadiscoverystudio_naburelease(start=harvest)
- load_uploadrelease = datadiscoverystudio_uploadrelease(start=load_release)
-
- load_prune = datadiscoverystudio_nabu_prune(start=load_uploadrelease)
- load_prov = datadiscoverystudio_nabuprov(start=load_prune)
- load_org = datadiscoverystudio_nabuorg(start=load_prov)
-
- summarize = datadiscoverystudio_summarize(start=load_uploadrelease)
- upload_summarize = datadiscoverystudio_upload_summarize(start=summarize)
-
-# run after load
- report_msgraph = datadiscoverystudio_missingreport_graph(start=summarize)
- report_graph = datadiscoverystudio_graph_reports(start=report_msgraph)
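-
-# A hedged sketch for local testing, not part of the generated file: a graph
-# can also be executed in-process during development, assuming the GLEANERIO_*
-# environment variables above are set.
-if __name__ == "__main__":
- result = harvest_datadiscoverystudio.execute_in_process()
- assert result.success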
-
-
-
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_designsafe.py b/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_designsafe.py
deleted file mode 100644
index 2859edf6..00000000
--- a/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_designsafe.py
+++ /dev/null
@@ -1,819 +0,0 @@
-import distutils
-import logging
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner, endpointUpdateNamespace
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-from ec.summarize import summaryDF2ttl, get_summary4graph, get_summary4repoSubset
-from ec.graph.manageGraph import ManageBlazegraph as mg
-import requests
-import logging as log
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# #
-# path to gleaner config in Dagster-daemon is "/scheduler/gleanerconfig.yaml" (config file mounted)
-# WHEN RUNNING dagster-dev, this needs to be a path to a local file
-##
-DAGSTER_GLEANER_CONFIG_PATH = os.environ.get('DAGSTER_GLEANER_CONFIG_PATH', "/scheduler/gleanerconfig.yaml")
-
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-
-# set for the earthcube utilities
-MINIO_OPTIONS={"secure":GLEANER_MINIO_USE_SSL
-
- ,"access_key": GLEANER_MINIO_ACCESS_KEY
- ,"secret_key": GLEANER_MINIO_SECRET_KEY
- }
-
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-#GLEANERIO_SUMMARY_GRAPH_ENDPOINT = os.environ.get('GLEANERIO_SUMMARY_GRAPH_ENDPOINT')
-GLEANERIO_SUMMARY_GRAPH_NAMESPACE = os.environ.get('GLEANERIO_SUMMARY_GRAPH_NAMESPACE',f"{GLEANER_GRAPH_NAMESPACE}_summary" )
-
-SUMMARY_PATH = 'graphs/summary'
-RELEASE_PATH = 'graphs/latest'
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-def _graphSummaryEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANERIO_SUMMARY_GRAPH_NAMESPACE}/sparql"
- return url
-def _pythonMinioAddress(url, port = None):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- if port is not None:
- PYTHON_MINIO_URL = f"{PYTHON_MINIO_URL}:{port}"
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioAddress(GLEANER_MINIO_ADDRESS,GLEANER_MINIO_PORT )
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
-
- server = _pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT)
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def post_to_graph(source, path=RELEASE_PATH, extension="nq", graphendpoint=_graphEndpoint()):
- # a revision of EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = _pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT)
- bucket = GLEANER_MINIO_BUCKET
- release_url = f"{proto}://{address}/{bucket}/{path}/{source}_release.{extension}"
- # BLAZEGRAPH SPECIFIC
- # url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- # get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- # r = requests.post(url)
- # log.debug(f' status:{r.status_code}') # status:404
- # get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- # if r.status_code == 200:
- # # ''
- # if 'data modified="0"' in r.text:
- # get_dagster_logger().info(f'graph: no data inserted ')
- # raise Exception("No Data Added: " + r.text)
- # return True
- # else:
- # get_dagster_logger().info(f'graph: error')
- # raise Exception(f' graph: insert failed: status:{r.status_code}')
-
- ### GENERIC LOAD FROM
- url = f"{graphendpoint}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- loadfrom = {'update': f'LOAD <{release_url}>'}
- headers = {
- 'Content-Type': 'application/x-www-form-urlencoded'
- }
- r = requests.post(url, headers=headers, data=loadfrom )
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: LOAD from {release_url}: status:{r.status_code}')
- if r.status_code == 200:
- get_dagster_logger().info(f'graph load response: {str(r.text)} ')
- # ''
- if 'mutationCount=0' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- #raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error {str(r.text)}')
- raise Exception(f' graph: failed, LOAD from {release_url}: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
- ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
- # docker.py: for a replicated job, total completions = replicas
- # replicas=0: you do not get a container
- service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
- mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise f"Container for service {name} not starting"
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) # the python needs to be wrapped, this does not
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
-        cid = container.id # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-# this method of watching the logs:
-        # do not let a possible issue with container logs stop log upload.
-        ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
-
-            get_dagster_logger().info(f"This is OK. Watching container logs failed with a Docker API issue: {repr(ex)}")
-    except Exception as ex:
-            get_dagster_logger().info(f"This is OK. Watching container logs failed with another issue: {repr(ex)}")
-
-
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + 'containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
-        # Future: need to extract files and upload them
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
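-# The if/elif chain at the top of gleanerio() repeats IMAGE/NAME/WorkingDir per
-# mode; a lookup table could replace it. A minimal sketch, kept as a comment
-# (MODE_TABLE is a hypothetical name, not something the generator emits; it
-# reuses the GLEANERIO_* constants defined above and omits Entrypoint, which is
-# unused downstream):
-#
-# MODE_TABLE = {
-#     "gleaner": (GLEANERIO_GLEANER_IMAGE, "/gleaner/",
-#                 ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH, "-source", "{source}", "--rude"]),
-#     "prune":   (GLEANERIO_NABU_IMAGE, "/nabu/",
-#                 ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/{source}"]),
-#     "prov":    (GLEANERIO_NABU_IMAGE, "/nabu/",
-#                 ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/{source}"]),
-#     "orgs":    (GLEANERIO_NABU_IMAGE, "/nabu/",
-#                 ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]),
-#     "release": (GLEANERIO_NABU_IMAGE, "/nabu/",
-#                 ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/{source}"]),
-# }
-# IMAGE, WorkingDir, arg_template = MODE_TABLE[str(mode)]
-# ARGS = [a.format(source=source) for a in arg_template]
-# NAME = f"sch_{source}_{str(mode)}"
-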
-@op
-def designsafe_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def designsafe_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "designsafe")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def designsafe_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "designsafe")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def designsafe_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "designsafe")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def designsafe_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "designsafe")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def designsafe_naburelease(context):
- returned_value = gleanerio(context,("release"), "designsafe")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def designsafe_uploadrelease(context):
- returned_value = post_to_graph("designsafe", extension="nq")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def designsafe_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="designsafe")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "designsafe"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-    r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def designsafe_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="designsafe")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "designsafe"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def designsafe_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="designsafe")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "designsafe"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def designsafe_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="designsafe")
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "designsafe"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
-    get_dagster_logger().info(f"identifier stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def designsafe_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "designsafe"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
-    get_dagster_logger().info(f"bucket urls report returned {r} ")
- return
-
-class S3ObjectInfo:
- bucket_name=""
- object_name=""
-
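-# S3ObjectInfo above is a bare attribute holder; a stdlib dataclass would state
-# the same intent more explicitly. A minimal sketch (same two fields, not what
-# the generator currently emits):
-#
-# from dataclasses import dataclass
-#
-# @dataclass
-# class S3ObjectInfo:
-#     bucket_name: str = ""
-#     object_name: str = ""
-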
-@op(ins={"start": In(Nothing)})
-def designsafe_summarize(context) :
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "designsafe"
- endpoint = _graphEndpoint() # getting data, not uploading data
- summary_namespace = _graphSummaryEndpoint()
-
-
- try:
-
- summarydf = get_summary4repoSubset(endpoint, source_name)
- nt, g = summaryDF2ttl(summarydf, source_name) # let's try the new generator
- summaryttl = g.serialize(format='longturtle')
-        # Let's always write the file out to s3, and insert as a separate process.
-        # we might be able to make this an asset..., but it would need to be accessible by http
-        # if not stored in s3
-        objectname = f"{SUMMARY_PATH}/{source_name}_release.ttl" # needs to match what is expected by post_to_graph
- s3ObjectInfo= S3ObjectInfo()
- s3ObjectInfo.bucket_name=bucket
- s3ObjectInfo.object_name=objectname
-
- s3Minio.putTextFileToStore(summaryttl, s3ObjectInfo )
- #inserted = sumnsgraph.insert(bytes(summaryttl, 'utf-8'), content_type="application/x-turtle")
- #if not inserted:
- # raise Exception("Loading to graph failed.")
- except Exception as e:
- # use dagster logger
- get_dagster_logger().error(f"Summary. Issue creating graph {str(e)} ")
- raise Exception(f"Loading Summary graph failed. {str(e)}")
- return 1
-
- return
-
-@op(ins={"start": In(Nothing)})
-def designsafe_upload_summarize(context):
- returned_value = post_to_graph("designsafe",path=SUMMARY_PATH, extension="ttl", graphendpoint=_graphSummaryEndpoint())
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload summary returned {r} ")
- return
-
-#Can we simplify and use just a method. Then import these methods?
-# def missingreport_s3(context, msg: str, source="designsafe"):
-#
-# source= getSitemapSourcesFromGleaner("/scheduler/gleanerconfig.yaml", sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="designsafe"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_designsafe():
- containers = designsafe_getImage()
- harvest = designsafe_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = designsafe_missingreport_s3(start=harvest)
- report_idstat = designsafe_identifier_stats(start=report_ms3)
-    # for some reason, this causes a "msg parameter missing" error
- report_bucketurl = designsafe_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="designsafe")
- load_release = designsafe_naburelease(start=harvest)
- load_uploadrelease = designsafe_uploadrelease(start=load_release)
-
- load_prune = designsafe_nabu_prune(start=load_uploadrelease)
- load_prov = designsafe_nabuprov(start=load_prune)
- load_org = designsafe_nabuorg(start=load_prov)
-
- summarize = designsafe_summarize(start=load_uploadrelease)
- upload_summarize = designsafe_upload_summarize(start=summarize)
-
-# run after load
- report_msgraph = designsafe_missingreport_graph(start=summarize)
- report_graph = designsafe_graph_reports(start=report_msgraph)
-
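-# The graph above only wires the ops together; launching it takes a job and,
-# typically, a schedule. A minimal sketch of that shape (the job/schedule names
-# and the cron string are illustrative, not the generator's actual output):
-#
-# from dagster import ScheduleDefinition
-#
-# harvest_designsafe_job = harvest_designsafe.to_job(name="harvest_designsafe_job")
-# harvest_designsafe_schedule = ScheduleDefinition(
-#     job=harvest_designsafe_job, cron_schedule="0 4 * * 0"
-# )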
-
-
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_earthchem.py b/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_earthchem.py
deleted file mode 100644
index b9596fbb..00000000
--- a/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_earthchem.py
+++ /dev/null
@@ -1,819 +0,0 @@
-import distutils
-import logging
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner, endpointUpdateNamespace
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-from ec.summarize import summaryDF2ttl, get_summary4graph, get_summary4repoSubset
-from ec.graph.manageGraph import ManageBlazegraph as mg
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# #
-# path to gleaner config in Dagster-daemon is "/scheduler/gleanerconfig.yaml" (config file mounted)
-# WHEN RUNNING dagster-dev, this needs to be a path to a local file
-##
-DAGSTER_GLEANER_CONFIG_PATH = os.environ.get('DAGSTER_GLEANER_CONFIG_PATH', "/scheduler/gleanerconfig.yaml")
-
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET = str(os.environ.get('GLEANERIO_MINIO_BUCKET'))
-
-# set for the earthcube utilities
-MINIO_OPTIONS = {
-    "secure": GLEANER_MINIO_USE_SSL,
-    "access_key": GLEANER_MINIO_ACCESS_KEY,
-    "secret_key": GLEANER_MINIO_SECRET_KEY,
-}
-
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-#GLEANERIO_SUMMARY_GRAPH_ENDPOINT = os.environ.get('GLEANERIO_SUMMARY_GRAPH_ENDPOINT')
-GLEANERIO_SUMMARY_GRAPH_NAMESPACE = os.environ.get('GLEANERIO_SUMMARY_GRAPH_NAMESPACE',f"{GLEANER_GRAPH_NAMESPACE}_summary" )
-
-SUMMARY_PATH = 'graphs/summary'
-RELEASE_PATH = 'graphs/latest'
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-def _graphSummaryEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANERIO_SUMMARY_GRAPH_NAMESPACE}/sparql"
- return url
-def _pythonMinioAddress(url, port = None):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- if port is not None:
- PYTHON_MINIO_URL = f"{PYTHON_MINIO_URL}:{port}"
- return PYTHON_MINIO_URL
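-# Examples of the address mapping above (hostnames illustrative):
-#   _pythonMinioAddress("minio.example.org", 9000)             -> "minio.example.org:9000"
-#   _pythonMinioAddress("mybucket.s3.us-east-1.amazonaws.com") -> "s3.amazonaws.com"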
-def read_file_bytestream(image_path):
-    with open(image_path, 'rb') as f:
-        data = f.read()
-    return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
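-# load_data() above accepts either a URL or a local path: urlopen raises
-# ValueError ("unknown url type") for a plain path, and the handler falls back
-# to reading the file, e.g. (paths illustrative):
-#   load_data("https://example.org/gleanerconfig.yaml")
-#   load_data("/tmp/gleanerconfig.yaml")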
-
-def s3reader(object):
- server = _pythonMinioAddress(GLEANER_MINIO_ADDRESS,GLEANER_MINIO_PORT )
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
-
- server = _pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT)
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
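-# Example use of s3loader() above (name illustrative): s3loader(b"run output",
-# "sch_earthchem_gleaner") writes a text/plain object to
-# <bucket>/scheduler/logs/sch_earthchem_gleaner_<timestamp>.log, per
-# GLEANERIO_LOG_PREFIX.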
-def post_to_graph(source, path=RELEASE_PATH, extension="nq", graphendpoint=_graphEndpoint()):
- # revision of EC utilities, will have a insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = _pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT)
- bucket = GLEANER_MINIO_BUCKET
- release_url = f"{proto}://{address}/{bucket}/{path}/{source}_release.{extension}"
- # BLAZEGRAPH SPECIFIC
- # url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- # get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- # r = requests.post(url)
- # log.debug(f' status:{r.status_code}') # status:404
- # get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- # if r.status_code == 200:
- # # ''
- # if 'data modified="0"' in r.text:
- # get_dagster_logger().info(f'graph: no data inserted ')
- # raise Exception("No Data Added: " + r.text)
- # return True
- # else:
- # get_dagster_logger().info(f'graph: error')
- # raise Exception(f' graph: insert failed: status:{r.status_code}')
-
- ### GENERIC LOAD FROM
- url = f"{graphendpoint}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- loadfrom = {'update': f'LOAD <{release_url}>'}
- headers = {
- 'Content-Type': 'application/x-www-form-urlencoded'
- }
- r = requests.post(url, headers=headers, data=loadfrom )
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: LOAD from {release_url}: status:{r.status_code}')
- if r.status_code == 200:
- get_dagster_logger().info(f'graph load response: {str(r.text)} ')
- # ''
- if 'mutationCount=0' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- #raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error {str(r.text)}')
- raise Exception(f' graph: failed, LOAD from {release_url}: status:{r.status_code}')
-
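-# The generic LOAD in post_to_graph() above goes over the wire as a
-# form-encoded SPARQL Update, i.e. the url-encoded form of
-#   update=LOAD <release_url>
-# roughly (URL illustrative):
-#   update=LOAD+%3Chttps%3A%2F%2Fminio.example.org%2Fgleaner%2Fgraphs%2Flatest%2Fearthchem_release.nq%3E
-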
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
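-# Note on _get_client() above: docker.DockerClient does not expose per-request
-# headers, so the Portainer X-API-Key is pushed into both the session headers
-# and _general_configs["HttpHeaders"] to make every API call authenticate.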
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
-    ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
- # docker.py if replicated job, total completions = replicas
- # replicas =0 you do not get a container
-    service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
-    service = client.services.create(
-        image,
-        args=command,
-        env=env_vars,
-        name=name,
-        networks=container_context.networks if len(container_context.networks) else None,
-        restart_policy=restart_policy,
-        mode=service_mode,
-        workdir=workingdir,
-        configs=configs,
-    )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
-        if wait_count > 12:
-            raise Exception(f"Container for service {name} is not starting")
-
-    get_dagster_logger().info(f"number of containers: {len(containers)}")
- return service, containers[0]
-
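-# Design note on _create_service() above: RestartPolicy(condition='none') plus
-# ServiceMode("replicated-job", replicas=1) yields a run-to-completion swarm
-# task, and the polling loop waits (~12s cap) until the task's container
-# exists so logs can be attached; slow-starting images would need a larger
-# wait_count limit.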
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) # the python needs to be wrapped, this does not
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
-        cid = container.id # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-# this method of watching the logs:
-        # do not let a possible issue with container logs stop log upload.
-        ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
-
-            get_dagster_logger().info(f"This is OK. Watching container logs failed with a Docker API issue: {repr(ex)}")
-    except Exception as ex:
-            get_dagster_logger().info(f"This is OK. Watching container logs failed with another issue: {repr(ex)}")
-
-
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + 'containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
-        # Future: need to extract files and upload them
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def earthchem_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def earthchem_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "earthchem")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def earthchem_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "earthchem")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def earthchem_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "earthchem")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def earthchem_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "earthchem")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def earthchem_naburelease(context):
- returned_value = gleanerio(context,("release"), "earthchem")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def earthchem_uploadrelease(context):
- returned_value = post_to_graph("earthchem", extension="nq")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def earthchem_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="earthchem")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "earthchem"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-    r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def earthchem_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="earthchem")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "earthchem"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def earthchem_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="earthchem")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "earthchem"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def earthchem_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="earthchem")
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "earthchem"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
-    get_dagster_logger().info(f"identifier stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def earthchem_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "earthchem"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
-    get_dagster_logger().info(f"bucket urls report returned {r} ")
- return
-
-class S3ObjectInfo:
- bucket_name=""
- object_name=""
-
-@op(ins={"start": In(Nothing)})
-def earthchem_summarize(context) :
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "earthchem"
- endpoint = _graphEndpoint() # getting data, not uploading data
- summary_namespace = _graphSummaryEndpoint()
-
-
- try:
-
- summarydf = get_summary4repoSubset(endpoint, source_name)
- nt, g = summaryDF2ttl(summarydf, source_name) # let's try the new generator
- summaryttl = g.serialize(format='longturtle')
-        # Let's always write the file out to s3, and insert as a separate process.
-        # we might be able to make this an asset..., but it would need to be accessible by http
-        # if not stored in s3
-        objectname = f"{SUMMARY_PATH}/{source_name}_release.ttl" # needs to match what is expected by post_to_graph
- s3ObjectInfo= S3ObjectInfo()
- s3ObjectInfo.bucket_name=bucket
- s3ObjectInfo.object_name=objectname
-
- s3Minio.putTextFileToStore(summaryttl, s3ObjectInfo )
- #inserted = sumnsgraph.insert(bytes(summaryttl, 'utf-8'), content_type="application/x-turtle")
- #if not inserted:
- # raise Exception("Loading to graph failed.")
- except Exception as e:
- # use dagster logger
- get_dagster_logger().error(f"Summary. Issue creating graph {str(e)} ")
- raise Exception(f"Loading Summary graph failed. {str(e)}")
- return 1
-
- return
-
-@op(ins={"start": In(Nothing)})
-def earthchem_upload_summarize(context):
- returned_value = post_to_graph("earthchem",path=SUMMARY_PATH, extension="ttl", graphendpoint=_graphSummaryEndpoint())
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload summary returned {r} ")
- return
-
-#Can we simplify and use just a method. Then import these methods?
-# def missingreport_s3(context, msg: str, source="earthchem"):
-#
-# source= getSitemapSourcesFromGleaner("/scheduler/gleanerconfig.yaml", sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="earthchem"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_earthchem():
- containers = earthchem_getImage()
- harvest = earthchem_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = earthchem_missingreport_s3(start=harvest)
- report_idstat = earthchem_identifier_stats(start=report_ms3)
-    # for some reason, this causes a "msg parameter missing" error
- report_bucketurl = earthchem_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="earthchem")
- load_release = earthchem_naburelease(start=harvest)
- load_uploadrelease = earthchem_uploadrelease(start=load_release)
-
- load_prune = earthchem_nabu_prune(start=load_uploadrelease)
- load_prov = earthchem_nabuprov(start=load_prune)
- load_org = earthchem_nabuorg(start=load_prov)
-
- summarize = earthchem_summarize(start=load_uploadrelease)
- upload_summarize = earthchem_upload_summarize(start=summarize)
-
-# run after load
- report_msgraph = earthchem_missingreport_graph(start=summarize)
- report_graph = earthchem_graph_reports(start=report_msgraph)
-
-
-
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_geocodes_demo_datasets.py b/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_geocodes_demo_datasets.py
deleted file mode 100644
index 0532ae5a..00000000
--- a/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_geocodes_demo_datasets.py
+++ /dev/null
@@ -1,819 +0,0 @@
-import distutils
-import logging
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner, endpointUpdateNamespace
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-from ec.summarize import summaryDF2ttl, get_summary4graph, get_summary4repoSubset
-from ec.graph.manageGraph import ManageBlazegraph as mg
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# #
-# path to gleaner config in Dagster-daemon is "/scheduler/gleanerconfig.yaml" (config file mounted)
-# WHEN RUNNING dagster-dev, this needs to be a path to a local file
-##
-DAGSTER_GLEANER_CONFIG_PATH = os.environ.get('DAGSTER_GLEANER_CONFIG_PATH', "/scheduler/gleanerconfig.yaml")
-
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET = str(os.environ.get('GLEANERIO_MINIO_BUCKET'))
-
-# set for the earthcube utilities
-MINIO_OPTIONS = {
-    "secure": GLEANER_MINIO_USE_SSL,
-    "access_key": GLEANER_MINIO_ACCESS_KEY,
-    "secret_key": GLEANER_MINIO_SECRET_KEY,
-}
-
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-#GLEANERIO_SUMMARY_GRAPH_ENDPOINT = os.environ.get('GLEANERIO_SUMMARY_GRAPH_ENDPOINT')
-GLEANERIO_SUMMARY_GRAPH_NAMESPACE = os.environ.get('GLEANERIO_SUMMARY_GRAPH_NAMESPACE',f"{GLEANER_GRAPH_NAMESPACE}_summary" )
-
-SUMMARY_PATH = 'graphs/summary'
-RELEASE_PATH = 'graphs/latest'
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-def _graphSummaryEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANERIO_SUMMARY_GRAPH_NAMESPACE}/sparql"
- return url
-def _pythonMinioAddress(url, port = None):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- if port is not None:
- PYTHON_MINIO_URL = f"{PYTHON_MINIO_URL}:{port}"
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
-    with open(image_path, 'rb') as f:
-        data = f.read()
-    return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioAddress(GLEANER_MINIO_ADDRESS,GLEANER_MINIO_PORT )
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
-
- server = _pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT)
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def post_to_graph(source, path=RELEASE_PATH, extension="nq", graphendpoint=_graphEndpoint()):
- # revision of EC utilities, will have a insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = _pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT)
- bucket = GLEANER_MINIO_BUCKET
- release_url = f"{proto}://{address}/{bucket}/{path}/{source}_release.{extension}"
- # BLAZEGRAPH SPECIFIC
- # url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- # get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- # r = requests.post(url)
- # log.debug(f' status:{r.status_code}') # status:404
- # get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- # if r.status_code == 200:
- # # ''
- # if 'data modified="0"' in r.text:
- # get_dagster_logger().info(f'graph: no data inserted ')
- # raise Exception("No Data Added: " + r.text)
- # return True
- # else:
- # get_dagster_logger().info(f'graph: error')
- # raise Exception(f' graph: insert failed: status:{r.status_code}')
-
- ### GENERIC LOAD FROM
- url = f"{graphendpoint}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
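- # SPARQL 1.1 Update: LOAD <url> tells the triplestore to fetch the release file directly from s3/minio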
- loadfrom = {'update': f'LOAD <{release_url}>'}
- headers = {
- 'Content-Type': 'application/x-www-form-urlencoded'
- }
- r = requests.post(url, headers=headers, data=loadfrom )
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: LOAD from {release_url}: status:{r.status_code}')
- if r.status_code == 200:
- get_dagster_logger().info(f'graph load response: {str(r.text)} ')
- # ''
- if 'mutationCount=0' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- #raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error {str(r.text)}')
- raise Exception(f' graph: failed, LOAD from {release_url}: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
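- # URL points at the Portainer-proxied Docker Engine API; the X-API-Key header authenticates each docker-py request with Portainer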
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
- ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
- # docker.py if replicated job, total completions = replicas
- # replicas =0 you do not get a container
- service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
- mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
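- # poll about once per second (12 tries max) until swarm surfaces a container for the replicated-job service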
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise f"Container for service {name} not starting"
-
- get_dagster_logger().info(f"containers: {len(containers)}")
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) # the python needs to be wrapped, this does not
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
- cid = container.id # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-# this method of watching the logs:
- # do not let a possible issue with container logs stop log upload.
- ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
-
- get_dagster_logger().info(f"This is ok. watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"This is ok. watch container logs failed other issue:{repr(ex)} ")
-
-
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
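- # pull {WorkingDir}/logs out of the container as a tar stream via the Docker Engine archive endpoint (Portainer-proxied, hence X-API-Key)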
- url = URL + 'containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
- # Future: need to extract files and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def geocodes_demo_datasets_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def geocodes_demo_datasets_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "geocodes_demo_datasets")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def geocodes_demo_datasets_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "geocodes_demo_datasets")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def geocodes_demo_datasets_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "geocodes_demo_datasets")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def geocodes_demo_datasets_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "geocodes_demo_datasets")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def geocodes_demo_datasets_naburelease(context):
- returned_value = gleanerio(context,("release"), "geocodes_demo_datasets")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def geocodes_demo_datasets_uploadrelease(context):
- returned_value = post_to_graph("geocodes_demo_datasets", extension="nq")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def geocodes_demo_datasets_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="geocodes_demo_datasets")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "geocodes_demo_datasets"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def geocodes_demo_datasets_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="geocodes_demo_datasets")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "geocodes_demo_datasets"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def geocodes_demo_datasets_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="geocodes_demo_datasets")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "geocodes_demo_datasets"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def geocodes_demo_datasets_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="geocodes_demo_datasets")
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "geocodes_demo_datasets"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def geocodes_demo_datasets_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "geocodes_demo_datasets"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-class S3ObjectInfo:
- bucket_name=""
- object_name=""
-
-@op(ins={"start": In(Nothing)})
-def geocodes_demo_datasets_summarize(context) :
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "geocodes_demo_datasets"
- endpoint = _graphEndpoint() # getting data, not uploading data
- summary_namespace = _graphSummaryEndpoint()
-
-
- try:
-
- summarydf = get_summary4repoSubset(endpoint, source_name)
- nt, g = summaryDF2ttl(summarydf, source_name) # let's try the new generator
- summaryttl = g.serialize(format='longturtle')
- # Let's always write the file out to s3, and insert as a separate process
- # we might be able to make this an asset..., but it would need to be accessible by http
- # if not stored in s3
- objectname = f"{SUMMARY_PATH}/{source_name}_release.ttl" # needs to match what post_to_graph expects
- s3ObjectInfo= S3ObjectInfo()
- s3ObjectInfo.bucket_name=bucket
- s3ObjectInfo.object_name=objectname
-
- s3Minio.putTextFileToStore(summaryttl, s3ObjectInfo )
- #inserted = sumnsgraph.insert(bytes(summaryttl, 'utf-8'), content_type="application/x-turtle")
- #if not inserted:
- # raise Exception("Loading to graph failed.")
- except Exception as e:
- # use dagster logger
- get_dagster_logger().error(f"Summary. Issue creating graph {str(e)} ")
- raise Exception(f"Loading Summary graph failed. {str(e)}")
- return 1
-
- return
-
-@op(ins={"start": In(Nothing)})
-def geocodes_demo_datasets_upload_summarize(context):
- returned_value = post_to_graph("geocodes_demo_datasets",path=SUMMARY_PATH, extension="ttl", graphendpoint=_graphSummaryEndpoint())
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload summary returned {r} ")
- return
-
-#Can we simplify and use just a method. Then import these methods?
-# def missingreport_s3(context, msg: str, source="geocodes_demo_datasets"):
-#
-# source= getSitemapSourcesFromGleaner("/scheduler/gleanerconfig.yaml", sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="geocodes_demo_datasets"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_geocodes_demo_datasets():
- containers = geocodes_demo_datasets_getImage()
- harvest = geocodes_demo_datasets_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = geocodes_demo_datasets_missingreport_s3(start=harvest)
- report_idstat = geocodes_demo_datasets_identifier_stats(start=report_ms3)
- # for some reason, this causes a msg parameter missing
- report_bucketurl = geocodes_demo_datasets_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="geocodes_demo_datasets")
- load_release = geocodes_demo_datasets_naburelease(start=harvest)
- load_uploadrelease = geocodes_demo_datasets_uploadrelease(start=load_release)
-
- load_prune = geocodes_demo_datasets_nabu_prune(start=load_uploadrelease)
- load_prov = geocodes_demo_datasets_nabuprov(start=load_prune)
- load_org = geocodes_demo_datasets_nabuorg(start=load_prov)
-
- summarize = geocodes_demo_datasets_summarize(start=load_uploadrelease)
- upload_summarize = geocodes_demo_datasets_upload_summarize(start=summarize)
-
-# run after load
- report_msgraph = geocodes_demo_datasets_missingreport_graph(start=summarize)
- report_graph = geocodes_demo_datasets_graph_reports(start=report_msgraph)
-
-
-
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_geocodes_examples.py b/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_geocodes_examples.py
deleted file mode 100644
index 9aad7ec4..00000000
--- a/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_geocodes_examples.py
+++ /dev/null
@@ -1,819 +0,0 @@
-import distutils
-import logging
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner, endpointUpdateNamespace
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-from ec.summarize import summaryDF2ttl, get_summary4graph, get_summary4repoSubset
-from ec.graph.manageGraph import ManageBlazegraph as mg
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# #
-# path to gleaner config in Dagster-daemon is "/scheduler/gleanerconfig.yaml" (config file mounted)
-# WHEN RUNNING dagster-dev, this needs to be a path to a local file
-##
-DAGSTER_GLEANER_CONFIG_PATH = os.environ.get('DAGSTER_GLEANER_CONFIG_PATH', "/scheduler/gleanerconfig.yaml")
-
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-
- # set for the earthcube utilities
-MINIO_OPTIONS={"secure":GLEANER_MINIO_USE_SSL
-
- ,"access_key": GLEANER_MINIO_ACCESS_KEY
- ,"secret_key": GLEANER_MINIO_SECRET_KEY
- }
-
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-#GLEANERIO_SUMMARY_GRAPH_ENDPOINT = os.environ.get('GLEANERIO_SUMMARY_GRAPH_ENDPOINT')
-GLEANERIO_SUMMARY_GRAPH_NAMESPACE = os.environ.get('GLEANERIO_SUMMARY_GRAPH_NAMESPACE',f"{GLEANER_GRAPH_NAMESPACE}_summary" )
-
-SUMMARY_PATH = 'graphs/summary'
-RELEASE_PATH = 'graphs/latest'
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-def _graphSummaryEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANERIO_SUMMARY_GRAPH_NAMESPACE}/sparql"
- return url
-def _pythonMinioAddress(url, port = None):
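- # the python minio client expects a bare host[:port]; AWS-hosted buckets collapse to the global s3.amazonaws.com endpoint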
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- if port is not None:
- PYTHON_MINIO_URL = f"{PYTHON_MINIO_URL}:{port}"
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- with open(image_path, 'rb') as f:
- data = f.read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioAddress(GLEANER_MINIO_ADDRESS,GLEANER_MINIO_PORT )
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
-
- server = _pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT)
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if it does not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def post_to_graph(source, path=RELEASE_PATH, extension="nq", graphendpoint=_graphEndpoint()):
- # revision of EC utilities, will have a insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = _pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT)
- bucket = GLEANER_MINIO_BUCKET
- release_url = f"{proto}://{address}/{bucket}/{path}/{source}_release.{extension}"
- # BLAZEGRAPH SPECIFIC
- # url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- # get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- # r = requests.post(url)
- # log.debug(f' status:{r.status_code}') # status:404
- # get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- # if r.status_code == 200:
- # # ''
- # if 'data modified="0"' in r.text:
- # get_dagster_logger().info(f'graph: no data inserted ')
- # raise Exception("No Data Added: " + r.text)
- # return True
- # else:
- # get_dagster_logger().info(f'graph: error')
- # raise Exception(f' graph: insert failed: status:{r.status_code}')
-
- ### GENERIC LOAD FROM
- url = f"{graphendpoint}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
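- # a SPARQL 1.1 LOAD <url> update has the triplestore pull the release file straight from s3/minio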
- loadfrom = {'update': f'LOAD <{release_url}>'}
- headers = {
- 'Content-Type': 'application/x-www-form-urlencoded'
- }
- r = requests.post(url, headers=headers, data=loadfrom )
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: LOAD from {release_url}: status:{r.status_code}')
- if r.status_code == 200:
- get_dagster_logger().info(f'graph load response: {str(r.text)} ')
- # ''
- if 'mutationCount=0' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- #raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error {str(r.text)}')
- raise Exception(f' graph: failed, LOAD from {release_url}: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
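- # base_url is the Portainer-proxied Docker Engine API; X-API-Key authenticates every request against Portainer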
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
- ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
- # docker.py if replicated job, total completions = replicas
- # replicas =0 you do not get a container
- service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
- mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
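- # swarm creates the job container asynchronously; poll once a second, giving up after 12 tries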
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise f"Container for service {name} not starting"
-
- get_dagster_logger().info(f"containers: {len(containers)}")
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) # the python needs to be wrapped, this does not
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
- cid = container.id # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-# this method of watching the logs:
- # do not let a possible issue with container logs stop log upload.
- ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
-
- get_dagster_logger().info(f"This is ok. watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"This is ok. watch container logs failed other issue:{repr(ex)} ")
-
-
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
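- # retrieve the container's logs directory as a tar stream from the Docker Engine archive endpoint (via Portainer)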
- url = URL + 'containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
- # Future: need to extract files and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def geocodes_examples_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def geocodes_examples_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "geocodes_examples")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def geocodes_examples_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "geocodes_examples")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def geocodes_examples_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "geocodes_examples")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def geocodes_examples_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "geocodes_examples")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def geocodes_examples_naburelease(context):
- returned_value = gleanerio(context,("release"), "geocodes_examples")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def geocodes_examples_uploadrelease(context):
- returned_value = post_to_graph("geocodes_examples", extension="nq")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def geocodes_examples_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="geocodes_examples")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "geocodes_examples"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def geocodes_examples_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="geocodes_examples")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "geocodes_examples"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def geocodes_examples_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="geocodes_examples")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "geocodes_examples"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def geocodes_examples_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="geocodes_examples")
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "geocodes_examples"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def geocodes_examples_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "geocodes_examples"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-class S3ObjectInfo:
- bucket_name=""
- object_name=""
-
-@op(ins={"start": In(Nothing)})
-def geocodes_examples_summarize(context) :
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "geocodes_examples"
- endpoint = _graphEndpoint() # getting data, not uploading data
- summary_namespace = _graphSummaryEndpoint()
-
-
- try:
-
- summarydf = get_summary4repoSubset(endpoint, source_name)
- nt, g = summaryDF2ttl(summarydf, source_name) # let's try the new generator
- summaryttl = g.serialize(format='longturtle')
- # Let's always write the file out to s3, and insert as a separate process
- # we might be able to make this an asset..., but it would need to be accessible by http
- # if not stored in s3
- objectname = f"{SUMMARY_PATH}/{source_name}_release.ttl" # needs to match what post_to_graph expects
- s3ObjectInfo= S3ObjectInfo()
- s3ObjectInfo.bucket_name=bucket
- s3ObjectInfo.object_name=objectname
-
- s3Minio.putTextFileToStore(summaryttl, s3ObjectInfo )
- #inserted = sumnsgraph.insert(bytes(summaryttl, 'utf-8'), content_type="application/x-turtle")
- #if not inserted:
- # raise Exception("Loading to graph failed.")
- except Exception as e:
- # use dagster logger
- get_dagster_logger().error(f"Summary. Issue creating graph {str(e)} ")
- raise Exception(f"Loading Summary graph failed. {str(e)}")
- return 1
-
- return
-
-@op(ins={"start": In(Nothing)})
-def geocodes_examples_upload_summarize(context):
- returned_value = post_to_graph("geocodes_examples",path=SUMMARY_PATH, extension="ttl", graphendpoint=_graphSummaryEndpoint())
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload summary returned {r} ")
- return
-
-#Can we simplify and use just a method. Then import these methods?
-# def missingreport_s3(context, msg: str, source="geocodes_examples"):
-#
-# source= getSitemapSourcesFromGleaner("/scheduler/gleanerconfig.yaml", sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="geocodes_examples"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_geocodes_examples():
- containers = geocodes_examples_getImage()
- harvest = geocodes_examples_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = geocodes_examples_missingreport_s3(start=harvest)
- report_idstat = geocodes_examples_identifier_stats(start=report_ms3)
- # for some reason, this causes a msg parameter missing
- report_bucketurl = geocodes_examples_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="geocodes_examples")
- load_release = geocodes_examples_naburelease(start=harvest)
- load_uploadrelease = geocodes_examples_uploadrelease(start=load_release)
-
- load_prune = geocodes_examples_nabu_prune(start=load_uploadrelease)
- load_prov = geocodes_examples_nabuprov(start=load_prune)
- load_org = geocodes_examples_nabuorg(start=load_prov)
-
- summarize = geocodes_examples_summarize(start=load_uploadrelease)
- upload_summarize = geocodes_examples_upload_summarize(start=summarize)
-
-# run after load
- report_msgraph = geocodes_examples_missingreport_graph(start=summarize)
- report_graph = geocodes_examples_graph_reports(start=report_msgraph)
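-
-# A hedged sketch of the start=In(Nothing) wiring used above (names are
-# hypothetical): no data is passed, the "start" input only sequences ops.
-#   @op(ins={"start": In(Nothing)})
-#   def step_b(context): ...
-#
-#   @graph
-#   def pipeline():
-#       step_b(start=step_a())  # step_b runs only after step_a completes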
-
-
-
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_hydroshare.py b/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_hydroshare.py
deleted file mode 100644
index 8adcd2c9..00000000
--- a/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_hydroshare.py
+++ /dev/null
@@ -1,819 +0,0 @@
-import distutils
-import logging
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner, endpointUpdateNamespace
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-from ec.summarize import summaryDF2ttl, get_summary4graph, get_summary4repoSubset
-from ec.graph.manageGraph import ManageBlazegraph as mg
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# #
-# path to gleaner config in Dagster-daemon is "/scheduler/gleanerconfig.yaml" (config file mounted)
-# WHEN RUNNING dagster-dev, this needs to be a path to a local file
-##
-DAGSTER_GLEANER_CONFIG_PATH = os.environ.get('DAGSTER_GLEANER_CONFIG_PATH', "/scheduler/gleanerconfig.yaml")
-
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-
-# set for the earthcube utilities
-MINIO_OPTIONS={"secure":GLEANER_MINIO_USE_SSL
-
- ,"access_key": GLEANER_MINIO_ACCESS_KEY
- ,"secret_key": GLEANER_MINIO_SECRET_KEY
- }
-
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-#GLEANERIO_SUMMARY_GRAPH_ENDPOINT = os.environ.get('GLEANERIO_SUMMARY_GRAPH_ENDPOINT')
-GLEANERIO_SUMMARY_GRAPH_NAMESPACE = os.environ.get('GLEANERIO_SUMMARY_GRAPH_NAMESPACE',f"{GLEANER_GRAPH_NAMESPACE}_summary" )
-
-SUMMARY_PATH = 'graphs/summary'
-RELEASE_PATH = 'graphs/latest'
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-def _graphSummaryEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANERIO_SUMMARY_GRAPH_NAMESPACE}/sparql"
- return url
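-# e.g. with GLEANERIO_GRAPH_URL=http://triplestore:9999/blazegraph and an
-# "earthcube" namespace (hypothetical values), _graphEndpoint() yields
-# http://triplestore:9999/blazegraph/namespace/earthcube/sparql, the
-# per-namespace Blazegraph SPARQL endpoint.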
-def _pythonMinioAddress(url, port = None):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- if port is not None:
- PYTHON_MINIO_URL = f"{PYTHON_MINIO_URL}:{port}"
- return PYTHON_MINIO_URL
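-# e.g. _pythonMinioAddress("minio.example.org", 9000) -> "minio.example.org:9000"
-# (hypothetical host); any *.amazonaws.com host is rewritten to
-# "s3.amazonaws.com" (the port, if given, is still appended).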
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioAddress(GLEANER_MINIO_ADDRESS,GLEANER_MINIO_PORT )
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
-
- server = _pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT)
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def post_to_graph(source, path=RELEASE_PATH, extension="nq", graphendpoint=_graphEndpoint()):
-    # a revision of EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = _pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT)
- bucket = GLEANER_MINIO_BUCKET
- release_url = f"{proto}://{address}/{bucket}/{path}/{source}_release.{extension}"
- # BLAZEGRAPH SPECIFIC
- # url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- # get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- # r = requests.post(url)
- # log.debug(f' status:{r.status_code}') # status:404
- # get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- # if r.status_code == 200:
- # # ''
- # if 'data modified="0"' in r.text:
- # get_dagster_logger().info(f'graph: no data inserted ')
- # raise Exception("No Data Added: " + r.text)
- # return True
- # else:
- # get_dagster_logger().info(f'graph: error')
- # raise Exception(f' graph: insert failed: status:{r.status_code}')
-
- ### GENERIC LOAD FROM
- url = f"{graphendpoint}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- loadfrom = {'update': f'LOAD <{release_url}>'}
- headers = {
- 'Content-Type': 'application/x-www-form-urlencoded'
- }
- r = requests.post(url, headers=headers, data=loadfrom )
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: LOAD from {release_url}: status:{r.status_code}')
- if r.status_code == 200:
- get_dagster_logger().info(f'graph load response: {str(r.text)} ')
- # ''
- if 'mutationCount=0' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- #raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error {str(r.text)}')
- raise Exception(f' graph: failed, LOAD from {release_url}: status:{r.status_code}')
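-# The LOAD above is standard SPARQL 1.1 Update; a one-off equivalent check
-# (hypothetical endpoint and object names) would be:
-#   requests.post("http://triplestore:9999/namespace/demo/sparql",
-#                 headers={"Content-Type": "application/x-www-form-urlencoded"},
-#                 data={"update": "LOAD <https://minio:9000/bucket/graphs/latest/demo_release.nq>"})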
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
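-# Note: _general_configs is a private docker-py attribute; populating its
-# HttpHeaders entry (and client.api.headers) should make every Docker API
-# request carry the Portainer X-API-Key header. See the decorator linked
-# further below for where docker-py reads these headers.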
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
-    ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
- # docker.py if replicated job, total completions = replicas
- # replicas =0 you do not get a container
-    service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
-        mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise f"Container for service {name} not starting"
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
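-# A "replicated-job" service with replicas=1 runs the container once to
-# completion (restart condition "none"). The loop above polls roughly once a
-# second, up to ~12s, for swarm to schedule the task's container so that its
-# id and logs can be read.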
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) # the python needs to be wrapped, this does not
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
-        cid = container.id # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-# this method of watching the logs,
- # do not let a possible issue with container logs stop log upload.
-    ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
-
- get_dagster_logger().info(f"This is ok. watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"This is ok. watch container logs failed other issue:{repr(ex)} ")
-
-
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + 'containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
-        # Future: need to extract files and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
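-# Typical use from the ops below:
-#   gleanerio(context, "gleaner", "hydroshare")   # harvest a source
-#   gleanerio(context, "release", "hydroshare")   # build the release graph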
-
-@op
-def hydroshare_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def hydroshare_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "hydroshare")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def hydroshare_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "hydroshare")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def hydroshare_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "hydroshare")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def hydroshare_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "hydroshare")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def hydroshare_naburelease(context):
- returned_value = gleanerio(context,("release"), "hydroshare")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def hydroshare_uploadrelease(context):
- returned_value = post_to_graph("hydroshare", extension="nq")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def hydroshare_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="hydroshare")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "hydroshare"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-    r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def hydroshare_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="hydroshare")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "hydroshare"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def hydroshare_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="hydroshare")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "hydroshare"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def hydroshare_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="hydroshare")
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "hydroshare"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def hydroshare_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "hydroshare"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-class S3ObjectInfo:
- bucket_name=""
- object_name=""
-
-@op(ins={"start": In(Nothing)})
-def hydroshare_summarize(context) :
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "hydroshare"
- endpoint = _graphEndpoint() # getting data, not uploading data
- summary_namespace = _graphSummaryEndpoint()
-
-
- try:
-
- summarydf = get_summary4repoSubset(endpoint, source_name)
- nt, g = summaryDF2ttl(summarydf, source_name) # let's try the new generator
- summaryttl = g.serialize(format='longturtle')
-        # Let's always write the file out to s3, and insert it as a separate process.
-        # We might be able to make this an asset, but it would need to be accessible
-        # by http if not stored in s3.
-        objectname = f"{SUMMARY_PATH}/{source_name}_release.ttl" # needs to match what post_to_graph expects
- s3ObjectInfo= S3ObjectInfo()
- s3ObjectInfo.bucket_name=bucket
- s3ObjectInfo.object_name=objectname
-
- s3Minio.putTextFileToStore(summaryttl, s3ObjectInfo )
- #inserted = sumnsgraph.insert(bytes(summaryttl, 'utf-8'), content_type="application/x-turtle")
- #if not inserted:
- # raise Exception("Loading to graph failed.")
- except Exception as e:
- # use dagster logger
- get_dagster_logger().error(f"Summary. Issue creating graph {str(e)} ")
- raise Exception(f"Loading Summary graph failed. {str(e)}")
- return 1
-
- return
-
-@op(ins={"start": In(Nothing)})
-def hydroshare_upload_summarize(context):
- returned_value = post_to_graph("hydroshare",path=SUMMARY_PATH, extension="ttl", graphendpoint=_graphSummaryEndpoint())
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload summary returned {r} ")
- return
-
-#Can we simplify and use just a method. Then import these methods?
-# def missingreport_s3(context, msg: str, source="hydroshare"):
-#
-# source= getSitemapSourcesFromGleaner("/scheduler/gleanerconfig.yaml", sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="hydroshare"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_hydroshare():
- containers = hydroshare_getImage()
- harvest = hydroshare_gleaner(start=containers)
-
-# defining nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = hydroshare_missingreport_s3(start=harvest)
- report_idstat = hydroshare_identifier_stats(start=report_ms3)
- # for some reason, this causes a msg parameter missing
- report_bucketurl = hydroshare_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="hydroshare")
- load_release = hydroshare_naburelease(start=harvest)
- load_uploadrelease = hydroshare_uploadrelease(start=load_release)
-
- load_prune = hydroshare_nabu_prune(start=load_uploadrelease)
- load_prov = hydroshare_nabuprov(start=load_prune)
- load_org = hydroshare_nabuorg(start=load_prov)
-
- summarize = hydroshare_summarize(start=load_uploadrelease)
- upload_summarize = hydroshare_upload_summarize(start=summarize)
-
-# run after load
- report_msgraph = hydroshare_missingreport_graph(start=summarize)
- report_graph = hydroshare_graph_reports(start=report_msgraph)
-
-
-
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_iedadata.py b/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_iedadata.py
deleted file mode 100644
index 150ac2de..00000000
--- a/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_iedadata.py
+++ /dev/null
@@ -1,819 +0,0 @@
-import distutils
-import logging
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner, endpointUpdateNamespace
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-from ec.summarize import summaryDF2ttl, get_summary4graph, get_summary4repoSubset
-from ec.graph.manageGraph import ManageBlazegraph as mg
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# #
-# path to gleaner config in Dagster-daemon is "/scheduler/gleanerconfig.yaml" (config file mounted)
-# WHEN RUNNING dagster-dev, this needs to be a path to a local file
-##
-DAGSTER_GLEANER_CONFIG_PATH = os.environ.get('DAGSTER_GLEANER_CONFIG_PATH', "/scheduler/gleanerconfig.yaml")
-
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-
-# set for the earthcube utilities
-MINIO_OPTIONS={"secure":GLEANER_MINIO_USE_SSL
-
- ,"access_key": GLEANER_MINIO_ACCESS_KEY
- ,"secret_key": GLEANER_MINIO_SECRET_KEY
- }
-
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-#GLEANERIO_SUMMARY_GRAPH_ENDPOINT = os.environ.get('GLEANERIO_SUMMARY_GRAPH_ENDPOINT')
-GLEANERIO_SUMMARY_GRAPH_NAMESPACE = os.environ.get('GLEANERIO_SUMMARY_GRAPH_NAMESPACE',f"{GLEANER_GRAPH_NAMESPACE}_summary" )
-
-SUMMARY_PATH = 'graphs/summary'
-RELEASE_PATH = 'graphs/latest'
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-def _graphSummaryEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANERIO_SUMMARY_GRAPH_NAMESPACE}/sparql"
- return url
-def _pythonMinioAddress(url, port = None):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- if port is not None:
- PYTHON_MINIO_URL = f"{PYTHON_MINIO_URL}:{port}"
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioAddress(GLEANER_MINIO_ADDRESS,GLEANER_MINIO_PORT )
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
-
- server = _pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT)
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def post_to_graph(source, path=RELEASE_PATH, extension="nq", graphendpoint=_graphEndpoint()):
-    # a revision of EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = _pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT)
- bucket = GLEANER_MINIO_BUCKET
- release_url = f"{proto}://{address}/{bucket}/{path}/{source}_release.{extension}"
- # BLAZEGRAPH SPECIFIC
- # url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- # get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- # r = requests.post(url)
- # log.debug(f' status:{r.status_code}') # status:404
- # get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- # if r.status_code == 200:
- # # ''
- # if 'data modified="0"' in r.text:
- # get_dagster_logger().info(f'graph: no data inserted ')
- # raise Exception("No Data Added: " + r.text)
- # return True
- # else:
- # get_dagster_logger().info(f'graph: error')
- # raise Exception(f' graph: insert failed: status:{r.status_code}')
-
- ### GENERIC LOAD FROM
- url = f"{graphendpoint}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- loadfrom = {'update': f'LOAD <{release_url}>'}
- headers = {
- 'Content-Type': 'application/x-www-form-urlencoded'
- }
- r = requests.post(url, headers=headers, data=loadfrom )
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: LOAD from {release_url}: status:{r.status_code}')
- if r.status_code == 200:
- get_dagster_logger().info(f'graph load response: {str(r.text)} ')
- # ''
- if 'mutationCount=0' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- #raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error {str(r.text)}')
- raise Exception(f' graph: failed, LOAD from {release_url}: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
-    ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
- # docker.py if replicated job, total completions = replicas
- # replicas =0 you do not get a container
-    service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
-        mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise f"Container for service {name} not starting"
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) # the python needs to be wrapped, this does not
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
-        cid = container.id # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-# this method of watching the logs,
- # do not let a possible issue with container logs stop log upload.
-    ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
-
- get_dagster_logger().info(f"This is ok. watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"This is ok. watch container logs failed other issue:{repr(ex)} ")
-
-
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + 'containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
-        # Future: need to extract files and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def iedadata_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def iedadata_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "iedadata")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def iedadata_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "iedadata")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def iedadata_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "iedadata")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def iedadata_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "iedadata")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def iedadata_naburelease(context):
- returned_value = gleanerio(context,("release"), "iedadata")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def iedadata_uploadrelease(context):
- returned_value = post_to_graph("iedadata", extension="nq")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def iedadata_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="iedadata")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "iedadata"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-    r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def iedadata_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="iedadata")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "iedadata"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def iedadata_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="iedadata")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "iedadata"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def iedadata_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="iedadata")
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "iedadata"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def iedadata_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "iedadata"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
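-# minimal holder for the bucket/object names that putTextFileToStore reads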
-class S3ObjectInfo:
- bucket_name=""
- object_name=""
-
-@op(ins={"start": In(Nothing)})
-def iedadata_summarize(context) :
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "iedadata"
- endpoint = _graphEndpoint() # getting data, not uploading data
- summary_namespace = _graphSummaryEndpoint()
-
-
- try:
-
- summarydf = get_summary4repoSubset(endpoint, source_name)
- nt, g = summaryDF2ttl(summarydf, source_name) # let's try the new generator
- summaryttl = g.serialize(format='longturtle')
-        # Let's always write the file out to s3, and insert it as a separate process
-        # we might be able to make this an asset..., but it would need to be accessible by http
-        # if not stored in s3
-        objectname = f"{SUMMARY_PATH}/{source_name}_release.ttl" # needs to match what post_to_graph expects
- s3ObjectInfo= S3ObjectInfo()
- s3ObjectInfo.bucket_name=bucket
- s3ObjectInfo.object_name=objectname
-
- s3Minio.putTextFileToStore(summaryttl, s3ObjectInfo )
- #inserted = sumnsgraph.insert(bytes(summaryttl, 'utf-8'), content_type="application/x-turtle")
- #if not inserted:
- # raise Exception("Loading to graph failed.")
- except Exception as e:
- # use dagster logger
- get_dagster_logger().error(f"Summary. Issue creating graph {str(e)} ")
- raise Exception(f"Loading Summary graph failed. {str(e)}")
- return 1
-
- return
-
-@op(ins={"start": In(Nothing)})
-def iedadata_upload_summarize(context):
- returned_value = post_to_graph("iedadata",path=SUMMARY_PATH, extension="ttl", graphendpoint=_graphSummaryEndpoint())
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload summary returned {r} ")
- return
-
-# Can we simplify this to plain methods and then import them?
-# def missingreport_s3(context, msg: str, source="iedadata"):
-#
-# source= getSitemapSourcesFromGleaner("/scheduler/gleanerconfig.yaml", sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="iedadata"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_iedadata():
- containers = iedadata_getImage()
- harvest = iedadata_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = iedadata_missingreport_s3(start=harvest)
- report_idstat = iedadata_identifier_stats(start=report_ms3)
-    # for some reason, this caused a "msg parameter missing" error
- report_bucketurl = iedadata_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="iedadata")
- load_release = iedadata_naburelease(start=harvest)
- load_uploadrelease = iedadata_uploadrelease(start=load_release)
-
- load_prune = iedadata_nabu_prune(start=load_uploadrelease)
- load_prov = iedadata_nabuprov(start=load_prune)
- load_org = iedadata_nabuorg(start=load_prov)
-
- summarize = iedadata_summarize(start=load_uploadrelease)
- upload_summarize = iedadata_upload_summarize(start=summarize)
-
-# run after load
- report_msgraph = iedadata_missingreport_graph(start=summarize)
- report_graph = iedadata_graph_reports(start=report_msgraph)
-
-
-
-
-
-
-
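
The per-source modules deleted above and below are copies of a single generated template: a `*_getImage` op pulls the images, and every downstream op takes a `start: In(Nothing)` input so the `harvest_*` graph can order side-effecting steps without passing data between them. A minimal sketch of that Nothing-dependency pattern, using hypothetical op names but the same idiom as the generated code:

    from dagster import In, Nothing, graph, op, get_dagster_logger

    @op
    def pull_images():
        # side effects only; the default output exists purely for ordering
        get_dagster_logger().info("images pulled")

    @op(ins={"start": In(Nothing)})
    def harvest():
        get_dagster_logger().info("runs only after pull_images")

    @op(ins={"start": In(Nothing)})
    def report():
        get_dagster_logger().info("runs only after harvest")

    @graph
    def harvest_example():
        # start=... declares ordering; no value flows through a Nothing input
        report(start=harvest(start=pull_images()))

    harvest_example_job = harvest_example.to_job()
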
diff --git a/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_iris.py b/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_iris.py
deleted file mode 100644
index e10f7586..00000000
--- a/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_iris.py
+++ /dev/null
@@ -1,819 +0,0 @@
-import distutils
-import logging
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner, endpointUpdateNamespace
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-from ec.summarize import summaryDF2ttl, get_summary4graph, get_summary4repoSubset
-from ec.graph.manageGraph import ManageBlazegraph as mg
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# #
-# path to gleaner config in Dagster-daemon is "/scheduler/gleanerconfig.yaml" (config file mounted)
-# WHEN RUNNING dagster-dev, this needs to be a path to a local file
-##
-DAGSTER_GLEANER_CONFIG_PATH = os.environ.get('DAGSTER_GLEANER_CONFIG_PATH', "/scheduler/gleanerconfig.yaml")
-
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-
-# set for the earthcube utilities
-MINIO_OPTIONS={"secure":GLEANER_MINIO_USE_SSL
-
- ,"access_key": GLEANER_MINIO_ACCESS_KEY
- ,"secret_key": GLEANER_MINIO_SECRET_KEY
- }
-
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-#GLEANERIO_SUMMARY_GRAPH_ENDPOINT = os.environ.get('GLEANERIO_SUMMARY_GRAPH_ENDPOINT')
-GLEANERIO_SUMMARY_GRAPH_NAMESPACE = os.environ.get('GLEANERIO_SUMMARY_GRAPH_NAMESPACE',f"{GLEANER_GRAPH_NAMESPACE}_summary" )
-
-SUMMARY_PATH = 'graphs/summary'
-RELEASE_PATH = 'graphs/latest'
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-def _graphSummaryEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANERIO_SUMMARY_GRAPH_NAMESPACE}/sparql"
- return url
-def _pythonMinioAddress(url, port = None):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- if port is not None:
- PYTHON_MINIO_URL = f"{PYTHON_MINIO_URL}:{port}"
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
-    with open(image_path, 'rb') as f:
-        data = f.read()
-    return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object_name):
- server = _pythonMinioAddress(GLEANER_MINIO_ADDRESS,GLEANER_MINIO_PORT )
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
-        data = client.get_object(GLEANER_MINIO_BUCKET, object_name)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
-
- server = _pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT)
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def post_to_graph(source, path=RELEASE_PATH, extension="nq", graphendpoint=_graphEndpoint()):
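-    # note: the graphendpoint default is evaluated once at import time; callers pass
-    # graphendpoint explicitly to target another namespace (see *_upload_summarize)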
- # revision of EC utilities, will have a insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = _pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT)
- bucket = GLEANER_MINIO_BUCKET
- release_url = f"{proto}://{address}/{bucket}/{path}/{source}_release.{extension}"
- # BLAZEGRAPH SPECIFIC
- # url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- # get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- # r = requests.post(url)
- # log.debug(f' status:{r.status_code}') # status:404
- # get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- # if r.status_code == 200:
- # # ''
- # if 'data modified="0"' in r.text:
- # get_dagster_logger().info(f'graph: no data inserted ')
- # raise Exception("No Data Added: " + r.text)
- # return True
- # else:
- # get_dagster_logger().info(f'graph: error')
- # raise Exception(f' graph: insert failed: status:{r.status_code}')
-
- ### GENERIC LOAD FROM
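-    # POST a form-encoded SPARQL 1.1 UPDATE 'LOAD <url>' so the graph store fetches the release file itself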
- url = f"{graphendpoint}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- loadfrom = {'update': f'LOAD <{release_url}>'}
- headers = {
- 'Content-Type': 'application/x-www-form-urlencoded'
- }
- r = requests.post(url, headers=headers, data=loadfrom )
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: LOAD from {release_url}: status:{r.status_code}')
- if r.status_code == 200:
- get_dagster_logger().info(f'graph load response: {str(r.text)} ')
- # ''
- if 'mutationCount=0' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- #raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error {str(r.text)}')
- raise Exception(f' graph: failed, LOAD from {release_url}: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
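-    # base_url is the Portainer-exposed Docker API; the X-API-Key header set below authenticates every call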
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
-    ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
- # docker.py if replicated job, total completions = replicas
- # replicas =0 you do not get a container
-    service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
-        mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
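-    # poll about once a second, up to 12 tries, for swarm to start the job's container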
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise f"Container for service {name} not starting"
-
-    get_dagster_logger().info(f"containers found: {len(containers)}")
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) # the python needs to be wrapped, this does not
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
-        cid = container.id # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-        # watch the logs this way so that a problem with the container logs
-        # does not stop the log upload.
-        ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
-
- get_dagster_logger().info(f"This is ok. watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"This is ok. watch container logs failed other issue:{repr(ex)} ")
-
-
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
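-        # containers/{id}/archive streams the log directory back as a tar via the Portainer-proxied Docker API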
- url = URL + 'containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
-        # Future: need to extract the files and upload them
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def iris_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def iris_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "iris")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def iris_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "iris")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def iris_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "iris")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def iris_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "iris")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def iris_naburelease(context):
- returned_value = gleanerio(context,("release"), "iris")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def iris_uploadrelease(context):
- returned_value = post_to_graph("iris", extension="nq")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def iris_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="iris")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "iris"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-    r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def iris_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="iris")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "iris"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def iris_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="iris")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "iris"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def iris_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="iris")
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "iris"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def iris_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "iris"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
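-# tiny value object naming the bucket/object for putTextFileToStore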
-class S3ObjectInfo:
- bucket_name=""
- object_name=""
-
-@op(ins={"start": In(Nothing)})
-def iris_summarize(context) :
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "iris"
- endpoint = _graphEndpoint() # getting data, not uploading data
- summary_namespace = _graphSummaryEndpoint()
-
-
- try:
-
- summarydf = get_summary4repoSubset(endpoint, source_name)
- nt, g = summaryDF2ttl(summarydf, source_name) # let's try the new generator
- summaryttl = g.serialize(format='longturtle')
-        # Let's always write the file out to s3, and insert it as a separate process
-        # we might be able to make this an asset..., but it would need to be accessible by http
-        # if not stored in s3
-        objectname = f"{SUMMARY_PATH}/{source_name}_release.ttl" # needs to match what post_to_graph expects
- s3ObjectInfo= S3ObjectInfo()
- s3ObjectInfo.bucket_name=bucket
- s3ObjectInfo.object_name=objectname
-
- s3Minio.putTextFileToStore(summaryttl, s3ObjectInfo )
- #inserted = sumnsgraph.insert(bytes(summaryttl, 'utf-8'), content_type="application/x-turtle")
- #if not inserted:
- # raise Exception("Loading to graph failed.")
- except Exception as e:
- # use dagster logger
- get_dagster_logger().error(f"Summary. Issue creating graph {str(e)} ")
- raise Exception(f"Loading Summary graph failed. {str(e)}")
- return 1
-
- return
-
-@op(ins={"start": In(Nothing)})
-def iris_upload_summarize(context):
- returned_value = post_to_graph("iris",path=SUMMARY_PATH, extension="ttl", graphendpoint=_graphSummaryEndpoint())
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload summary returned {r} ")
- return
-
-# Can we simplify this to plain methods and then import them?
-# def missingreport_s3(context, msg: str, source="iris"):
-#
-# source= getSitemapSourcesFromGleaner("/scheduler/gleanerconfig.yaml", sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="iris"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_iris():
- containers = iris_getImage()
- harvest = iris_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = iris_missingreport_s3(start=harvest)
- report_idstat = iris_identifier_stats(start=report_ms3)
-    # for some reason, this caused a "msg parameter missing" error
- report_bucketurl = iris_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="iris")
- load_release = iris_naburelease(start=harvest)
- load_uploadrelease = iris_uploadrelease(start=load_release)
-
- load_prune = iris_nabu_prune(start=load_uploadrelease)
- load_prov = iris_nabuprov(start=load_prune)
- load_org = iris_nabuorg(start=load_prov)
-
- summarize = iris_summarize(start=load_uploadrelease)
- upload_summarize = iris_upload_summarize(start=summarize)
-
-# run after load
- report_msgraph = iris_missingreport_graph(start=summarize)
- report_graph = iris_graph_reports(start=report_msgraph)
-
-
-
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_linkedearth.py b/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_linkedearth.py
deleted file mode 100644
index df495780..00000000
--- a/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_linkedearth.py
+++ /dev/null
@@ -1,819 +0,0 @@
-import distutils
-import logging
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner, endpointUpdateNamespace
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-from ec.summarize import summaryDF2ttl, get_summary4graph, get_summary4repoSubset
-from ec.graph.manageGraph import ManageBlazegraph as mg
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# #
-# path to gleaner config in Dagster-daemon is "/scheduler/gleanerconfig.yaml" (config file mounted)
-# WHEN RUNNING dagster-dev, this needs to be a path to a local file
-##
-DAGSTER_GLEANER_CONFIG_PATH = os.environ.get('DAGSTER_GLEANER_CONFIG_PATH', "/scheduler/gleanerconfig.yaml")
-
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-
-# set for the earthcube utilities
-MINIO_OPTIONS={"secure":GLEANER_MINIO_USE_SSL
-
- ,"access_key": GLEANER_MINIO_ACCESS_KEY
- ,"secret_key": GLEANER_MINIO_SECRET_KEY
- }
-
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-#GLEANERIO_SUMMARY_GRAPH_ENDPOINT = os.environ.get('GLEANERIO_SUMMARY_GRAPH_ENDPOINT')
-GLEANERIO_SUMMARY_GRAPH_NAMESPACE = os.environ.get('GLEANERIO_SUMMARY_GRAPH_NAMESPACE',f"{GLEANER_GRAPH_NAMESPACE}_summary" )
-
-SUMMARY_PATH = 'graphs/summary'
-RELEASE_PATH = 'graphs/latest'
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-def _graphSummaryEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANERIO_SUMMARY_GRAPH_NAMESPACE}/sparql"
- return url
-def _pythonMinioAddress(url, port = None):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- if port is not None:
- PYTHON_MINIO_URL = f"{PYTHON_MINIO_URL}:{port}"
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
-    with open(image_path, 'rb') as f:
-        data = f.read()
-    return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object_name):
- server = _pythonMinioAddress(GLEANER_MINIO_ADDRESS,GLEANER_MINIO_PORT )
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
-        data = client.get_object(GLEANER_MINIO_BUCKET, object_name)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
-
- server = _pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT)
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def post_to_graph(source, path=RELEASE_PATH, extension="nq", graphendpoint=_graphEndpoint()):
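-    # graphendpoint's default binds at import time; pass graphendpoint= to load into a different namespace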
- # revision of EC utilities, will have a insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = _pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT)
- bucket = GLEANER_MINIO_BUCKET
- release_url = f"{proto}://{address}/{bucket}/{path}/{source}_release.{extension}"
- # BLAZEGRAPH SPECIFIC
- # url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- # get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- # r = requests.post(url)
- # log.debug(f' status:{r.status_code}') # status:404
- # get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- # if r.status_code == 200:
- # # ''
- # if 'data modified="0"' in r.text:
- # get_dagster_logger().info(f'graph: no data inserted ')
- # raise Exception("No Data Added: " + r.text)
- # return True
- # else:
- # get_dagster_logger().info(f'graph: error')
- # raise Exception(f' graph: insert failed: status:{r.status_code}')
-
- ### GENERIC LOAD FROM
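-    # form-encoded SPARQL 1.1 'LOAD <release_url>' update; the endpoint pulls the file directly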
- url = f"{graphendpoint}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- loadfrom = {'update': f'LOAD <{release_url}>'}
- headers = {
- 'Content-Type': 'application/x-www-form-urlencoded'
- }
- r = requests.post(url, headers=headers, data=loadfrom )
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: LOAD from {release_url}: status:{r.status_code}')
- if r.status_code == 200:
- get_dagster_logger().info(f'graph load response: {str(r.text)} ')
- # ''
- if 'mutationCount=0' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- #raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error {str(r.text)}')
- raise Exception(f' graph: failed, LOAD from {release_url}: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
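-    # Docker API reached via the Portainer URL, authenticated with the X-API-Key header added below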
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
-    ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
- # docker.py if replicated job, total completions = replicas
- # replicas =0 you do not get a container
-    service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
-        mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
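-    # wait up to ~12 seconds for the replicated-job container to appear before giving up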
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise f"Container for service {name} not starting"
-
-    get_dagster_logger().info(f"containers found: {len(containers)}")
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) # the python needs to be wrapped, this does not
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
- cid = container.id # legacy until the service start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-# this method of watching the logs is wrapped in try/except so that
- # a possible issue with container logs does not stop the log upload.
- ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
-
- get_dagster_logger().info(f"This is ok. watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"This is ok. watch container logs failed other issue:{repr(ex)} ")
-
-
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
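- # container.wait() blocks until the container stops and returns a dict
- # whose "StatusCode" entry is the process exit code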
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
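- # latin-1 maps every byte to a character, so this decode never raises,
- # unlike utf-8 which can fail on arbitrary container output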
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + 'containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
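- # GET /containers/{id}/archive on the Docker Engine API returns a tar
- # stream of the given path (here the container's log directory)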
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
- # Future: need to extract files, and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def linkedearth_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def linkedearth_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "linkedearth")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def linkedearth_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "linkedearth")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def linkedearth_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "linkedearth")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def linkedearth_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "linkedearth")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def linkedearth_naburelease(context):
- returned_value = gleanerio(context,("release"), "linkedearth")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def linkedearth_uploadrelease(context):
- returned_value = post_to_graph("linkedearth", extension="nq")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def linkedearth_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="linkedearth")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "linkedearth"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def linkedearth_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="linkedearth")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "linkedearth"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def linkedearth_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="linkedearth")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "linkedearth"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def linkedearth_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="linkedearth")
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "linkedearth"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def linkedearth_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "linkedearth"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-class S3ObjectInfo:
- bucket_name=""
- object_name=""
-
-@op(ins={"start": In(Nothing)})
-def linkedearth_summarize(context) :
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "linkedearth"
- endpoint = _graphEndpoint() # getting data, not uploading data
- summary_namespace = _graphSummaryEndpoint()
-
-
- try:
-
- summarydf = get_summary4repoSubset(endpoint, source_name)
- nt, g = summaryDF2ttl(summarydf, source_name) # let's try the new generator
- summaryttl = g.serialize(format='longturtle')
- # Let's always write the file out to s3, and insert as a separate process
- # we might be able to make this an asset..., but it would need to be accessible by http
- # if not stored in s3
- objectname = f"{SUMMARY_PATH}/{source_name}_release.ttl" # needs to match what post_to_graph expects
- s3ObjectInfo= S3ObjectInfo()
- s3ObjectInfo.bucket_name=bucket
- s3ObjectInfo.object_name=objectname
-
- s3Minio.putTextFileToStore(summaryttl, s3ObjectInfo )
- #inserted = sumnsgraph.insert(bytes(summaryttl, 'utf-8'), content_type="application/x-turtle")
- #if not inserted:
- # raise Exception("Loading to graph failed.")
- except Exception as e:
- # use dagster logger
- get_dagster_logger().error(f"Summary. Issue creating graph {str(e)} ")
- raise Exception(f"Loading Summary graph failed. {str(e)}")
- return 1
-
- return
-
-@op(ins={"start": In(Nothing)})
-def linkedearth_upload_summarize(context):
- returned_value = post_to_graph("linkedearth",path=SUMMARY_PATH, extension="ttl", graphendpoint=_graphSummaryEndpoint())
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload summary returned {r} ")
- return
-
-#Can we simplify and use just a method. Then import these methods?
-# def missingreport_s3(context, msg: str, source="linkedearth"):
-#
-# source= getSitemapSourcesFromGleaner("/scheduler/gleanerconfig.yaml", sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="linkedearth"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_linkedearth():
- containers = linkedearth_getImage()
- harvest = linkedearth_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = linkedearth_missingreport_s3(start=harvest)
- report_idstat = linkedearth_identifier_stats(start=report_ms3)
- # for some reason, this causes a 'msg parameter missing' error
- report_bucketurl = linkedearth_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="linkedearth")
- load_release = linkedearth_naburelease(start=harvest)
- load_uploadrelease = linkedearth_uploadrelease(start=load_release)
-
- load_prune = linkedearth_nabu_prune(start=load_uploadrelease)
- load_prov = linkedearth_nabuprov(start=load_prune)
- load_org = linkedearth_nabuorg(start=load_prov)
-
- summarize = linkedearth_summarize(start=load_uploadrelease)
- upload_summarize = linkedearth_upload_summarize(start=summarize)
-
-# run after load
- report_msgraph = linkedearth_missingreport_graph(start=summarize)
- report_graph = linkedearth_graph_reports(start=report_msgraph)
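-# sketch: harvest_linkedearth.to_job() would wrap this graph into a runnable job
-# (assumes the usual Dagster graph-to-job pattern; the actual job wiring lives elsewhere)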
-
-
-
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_lipdverse.py b/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_lipdverse.py
deleted file mode 100644
index 020a942c..00000000
--- a/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_lipdverse.py
+++ /dev/null
@@ -1,819 +0,0 @@
-import distutils.util
-import logging
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner, endpointUpdateNamespace
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-from ec.summarize import summaryDF2ttl, get_summary4graph, get_summary4repoSubset
-from ec.graph.manageGraph import ManageBlazegraph as mg
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-##
-# path to the gleaner config in the Dagster daemon is "/scheduler/gleanerconfig.yaml" (config file mounted)
-# WHEN RUNNING dagster-dev, this needs to be a path to a local file
-##
-DAGSTER_GLEANER_CONFIG_PATH = os.environ.get('DAGSTER_GLEANER_CONFIG_PATH', "/scheduler/gleanerconfig.yaml")
-
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
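-# NOTE: distutils is deprecated and removed in Python 3.12; strtobool accepts
-# "true"/"false", "yes"/"no", "1"/"0" and similar values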
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-
-# set for the earthcube utilities
-MINIO_OPTIONS={"secure":GLEANER_MINIO_USE_SSL
-
- ,"access_key": GLEANER_MINIO_ACCESS_KEY
- ,"secret_key": GLEANER_MINIO_SECRET_KEY
- }
-
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-#GLEANERIO_SUMMARY_GRAPH_ENDPOINT = os.environ.get('GLEANERIO_SUMMARY_GRAPH_ENDPOINT')
-GLEANERIO_SUMMARY_GRAPH_NAMESPACE = os.environ.get('GLEANERIO_SUMMARY_GRAPH_NAMESPACE',f"{GLEANER_GRAPH_NAMESPACE}_summary" )
-
-SUMMARY_PATH = 'graphs/summary'
-RELEASE_PATH = 'graphs/latest'
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-def _graphSummaryEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANERIO_SUMMARY_GRAPH_NAMESPACE}/sparql"
- return url
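-# e.g. http://graph:9999/blazegraph/namespace/eco_summary/sparql (hypothetical values)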
-def _pythonMinioAddress(url, port = None):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- if port is not None:
- PYTHON_MINIO_URL = f"{PYTHON_MINIO_URL}:{port}"
- return PYTHON_MINIO_URL
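-# e.g. _pythonMinioAddress("minio.example.org", 9000) -> "minio.example.org:9000" (hypothetical host)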
-def read_file_bytestream(image_path):
- # context manager ensures the file handle is closed
- with open(image_path, 'rb') as f:
- return f.read()
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object_name):
- server = _pythonMinioAddress(GLEANER_MINIO_ADDRESS,GLEANER_MINIO_PORT )
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object_name)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
-
- server = _pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT)
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def post_to_graph(source, path=RELEASE_PATH, extension="nq", graphendpoint=_graphEndpoint()):
- # revision of EC utilities, will have a insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = _pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT)
- bucket = GLEANER_MINIO_BUCKET
- release_url = f"{proto}://{address}/{bucket}/{path}/{source}_release.{extension}"
- # BLAZEGRAPH SPECIFIC
- # url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- # get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- # r = requests.post(url)
- # log.debug(f' status:{r.status_code}') # status:404
- # get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- # if r.status_code == 200:
- # # ''
- # if 'data modified="0"' in r.text:
- # get_dagster_logger().info(f'graph: no data inserted ')
- # raise Exception("No Data Added: " + r.text)
- # return True
- # else:
- # get_dagster_logger().info(f'graph: error')
- # raise Exception(f' graph: insert failed: status:{r.status_code}')
-
- ### GENERIC LOAD FROM
- url = f"{graphendpoint}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- loadfrom = {'update': f'LOAD <{release_url}>'}
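- # the 'update' form field carries a SPARQL 1.1 UPDATE request; LOAD <url>
- # tells the triplestore to fetch and ingest the file from the object store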
- headers = {
- 'Content-Type': 'application/x-www-form-urlencoded'
- }
- r = requests.post(url, headers=headers, data=loadfrom )
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: LOAD from {release_url}: status:{r.status_code}')
- if r.status_code == 200:
- get_dagster_logger().info(f'graph load response: {str(r.text)} ')
- # ''
- if 'mutationCount=0' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- #raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error {str(r.text)}')
- raise Exception(f' graph: failed, LOAD from {release_url}: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
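- # both the general config and per-request headers now carry the Portainer
- # API key, so every Docker API call through this client is authenticated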
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
- ## thoughts
- # return service, container, since there is one
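- # condition='none' stops swarm from restarting the one-shot job container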
- restart_policy = RestartPolicy(condition='none')
- # docker.py if replicated job, total completions = replicas
- # replicas =0 you do not get a container
- service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
- mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise f"Container for service {name} not starting"
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) # the python needs to be wrapped, this does not
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
- cid = container.id # legacy until the service start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-# this method of watching the logs is wrapped in try/except so that
- # a possible issue with container logs does not stop the log upload.
- ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
-
- get_dagster_logger().info(f"This is ok. watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"This is ok. watch container logs failed other issue:{repr(ex)} ")
-
-
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + 'containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
- # Future: need to extract files, and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def lipdverse_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def lipdverse_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "lipdverse")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def lipdverse_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "lipdverse")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def lipdverse_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "lipdverse")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def lipdverse_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "lipdverse")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def lipdverse_naburelease(context):
- returned_value = gleanerio(context,("release"), "lipdverse")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def lipdverse_uploadrelease(context):
- returned_value = post_to_graph("lipdverse", extension="nq")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def lipdverse_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="lipdverse")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "lipdverse"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def lipdverse_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="lipdverse")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "lipdverse"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def lipdverse_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="lipdverse")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "lipdverse"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def lipdverse_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="lipdverse")
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "lipdverse"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def lipdverse_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "lipdverse"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-class S3ObjectInfo:
- bucket_name=""
- object_name=""
-
-@op(ins={"start": In(Nothing)})
-def lipdverse_summarize(context) :
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "lipdverse"
- endpoint = _graphEndpoint() # getting data, not uploading data
- summary_namespace = _graphSummaryEndpoint()
-
-
- try:
-
- summarydf = get_summary4repoSubset(endpoint, source_name)
- nt, g = summaryDF2ttl(summarydf, source_name) # let's try the new generator
- summaryttl = g.serialize(format='longturtle')
- # Let's always write the file out to s3, and insert as a separate process
- # we might be able to make this an asset..., but it would need to be accessible by http
- # if not stored in s3
- objectname = f"{SUMMARY_PATH}/{source_name}_release.ttl" # needs to match what post_to_graph expects
- s3ObjectInfo= S3ObjectInfo()
- s3ObjectInfo.bucket_name=bucket
- s3ObjectInfo.object_name=objectname
-
- s3Minio.putTextFileToStore(summaryttl, s3ObjectInfo )
- #inserted = sumnsgraph.insert(bytes(summaryttl, 'utf-8'), content_type="application/x-turtle")
- #if not inserted:
- # raise Exception("Loading to graph failed.")
- except Exception as e:
- # use dagster logger
- get_dagster_logger().error(f"Summary. Issue creating graph {str(e)} ")
- raise Exception(f"Loading Summary graph failed. {str(e)}")
- return 1
-
- return
-
-@op(ins={"start": In(Nothing)})
-def lipdverse_upload_summarize(context):
- returned_value = post_to_graph("lipdverse",path=SUMMARY_PATH, extension="ttl", graphendpoint=_graphSummaryEndpoint())
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload summary returned {r} ")
- return
-
-#Can we simplify and use just a method. Then import these methods?
-# def missingreport_s3(context, msg: str, source="lipdverse"):
-#
-# source= getSitemapSourcesFromGleaner("/scheduler/gleanerconfig.yaml", sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="lipdverse"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_lipdverse():
- containers = lipdverse_getImage()
- harvest = lipdverse_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = lipdverse_missingreport_s3(start=harvest)
- report_idstat = lipdverse_identifier_stats(start=report_ms3)
- # for some reason, this causes a 'msg parameter missing' error
- report_bucketurl = lipdverse_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="lipdverse")
- load_release = lipdverse_naburelease(start=harvest)
- load_uploadrelease = lipdverse_uploadrelease(start=load_release)
-
- load_prune = lipdverse_nabu_prune(start=load_uploadrelease)
- load_prov = lipdverse_nabuprov(start=load_prune)
- load_org = lipdverse_nabuorg(start=load_prov)
-
- summarize = lipdverse_summarize(start=load_uploadrelease)
- upload_summarize = lipdverse_upload_summarize(start=summarize)
-
-# run after load
- report_msgraph = lipdverse_missingreport_graph(start=summarize)
- report_graph = lipdverse_graph_reports(start=report_msgraph)
-
-
-
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_magic.py b/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_magic.py
deleted file mode 100644
index 2ade2cd2..00000000
--- a/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_magic.py
+++ /dev/null
@@ -1,819 +0,0 @@
-import distutils.util
-import logging
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner, endpointUpdateNamespace
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-from ec.summarize import summaryDF2ttl, get_summary4graph, get_summary4repoSubset
-from ec.graph.manageGraph import ManageBlazegraph as mg
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-##
-# path to the gleaner config in the Dagster daemon is "/scheduler/gleanerconfig.yaml" (config file mounted)
-# WHEN RUNNING dagster-dev, this needs to be a path to a local file
-##
-DAGSTER_GLEANER_CONFIG_PATH = os.environ.get('DAGSTER_GLEANER_CONFIG_PATH', "/scheduler/gleanerconfig.yaml")
-
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-
-# set for the earthcube utilities
-MINIO_OPTIONS={"secure":GLEANER_MINIO_USE_SSL
-
- ,"access_key": GLEANER_MINIO_ACCESS_KEY
- ,"secret_key": GLEANER_MINIO_SECRET_KEY
- }
-
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-#GLEANERIO_SUMMARY_GRAPH_ENDPOINT = os.environ.get('GLEANERIO_SUMMARY_GRAPH_ENDPOINT')
-GLEANERIO_SUMMARY_GRAPH_NAMESPACE = os.environ.get('GLEANERIO_SUMMARY_GRAPH_NAMESPACE',f"{GLEANER_GRAPH_NAMESPACE}_summary" )
-
-SUMMARY_PATH = 'graphs/summary'
-RELEASE_PATH = 'graphs/latest'
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-def _graphSummaryEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANERIO_SUMMARY_GRAPH_NAMESPACE}/sparql"
- return url
-def _pythonMinioAddress(url, port = None):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- if port is not None:
- PYTHON_MINIO_URL = f"{PYTHON_MINIO_URL}:{port}"
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- # context manager ensures the file handle is closed
- with open(image_path, 'rb') as f:
- return f.read()
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object_name):
- server = _pythonMinioAddress(GLEANER_MINIO_ADDRESS,GLEANER_MINIO_PORT )
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object_name)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
-
- server = _pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT)
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def post_to_graph(source, path=RELEASE_PATH, extension="nq", graphendpoint=_graphEndpoint()):
- # revision of EC utilities, will have a insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = _pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT)
- bucket = GLEANER_MINIO_BUCKET
- release_url = f"{proto}://{address}/{bucket}/{path}/{source}_release.{extension}"
- # BLAZEGRAPH SPECIFIC
- # url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- # get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- # r = requests.post(url)
- # log.debug(f' status:{r.status_code}') # status:404
- # get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- # if r.status_code == 200:
- # # ''
- # if 'data modified="0"' in r.text:
- # get_dagster_logger().info(f'graph: no data inserted ')
- # raise Exception("No Data Added: " + r.text)
- # return True
- # else:
- # get_dagster_logger().info(f'graph: error')
- # raise Exception(f' graph: insert failed: status:{r.status_code}')
-
- ### GENERIC LOAD FROM
- url = f"{graphendpoint}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- loadfrom = {'update': f'LOAD <{release_url}>'}
- headers = {
- 'Content-Type': 'application/x-www-form-urlencoded'
- }
- r = requests.post(url, headers=headers, data=loadfrom )
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: LOAD from {release_url}: status:{r.status_code}')
- if r.status_code == 200:
- get_dagster_logger().info(f'graph load response: {str(r.text)} ')
- # ''
- if 'mutationCount=0' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- #raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error {str(r.text)}')
- raise Exception(f' graph: failed, LOAD from {release_url}: status:{r.status_code}')
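`post_to_graph` boils down to a SPARQL 1.1 UPDATE `LOAD <url>` sent as a form-encoded POST. A minimal equivalent request, with an assumed endpoint and release URL, looks like this:

```python
import requests

# Assumed endpoint and release URL, mirroring the form-encoded LOAD above.
endpoint = "http://graph:9999/blazegraph/namespace/eco/sparql"
release_url = "https://minio.example.org/gleaner/graphs/latest/magic_release.nq"

r = requests.post(
    endpoint,
    headers={"Content-Type": "application/x-www-form-urlencoded"},
    data={"update": f"LOAD <{release_url}>"},  # SPARQL 1.1 UPDATE request
)
r.raise_for_status()
```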
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
-    ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
- # docker.py if replicated job, total completions = replicas
- # replicas =0 you do not get a container
-    service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
-        mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise f"Container for service {name} not starting"
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
-        # add env variables here, e.g. "Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) # the python needs to be wrapped, this does not
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
-        cid = container.id  # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-# this method of watching the logs,
- # do not let a possible issue with container logs stop log upload.
-        ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
-
- get_dagster_logger().info(f"This is ok. watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"This is ok. watch container logs failed other issue:{repr(ex)} ")
-
-
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + 'containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
-        # Future: need to extract files, and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
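The `if`/`elif` chain at the top of `gleanerio` maps a mode to an image, working directory, and argument list. A table-driven sketch of the same dispatch (assuming the module's `GLEANERIO_*` constants are in scope; this is not the repo's implementation):

```python
# Sketch of a table-driven replacement for the mode if/elif chain above;
# names mirror the module's constants, structure is illustrative.
MODES = {
    "gleaner": (GLEANERIO_GLEANER_IMAGE, "/gleaner/",
                lambda src: ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH, "-source", src, "--rude"]),
    "prune":   (GLEANERIO_NABU_IMAGE, "/nabu/",
                lambda src: ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + src]),
    "prov":    (GLEANERIO_NABU_IMAGE, "/nabu/",
                lambda src: ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + src]),
    "orgs":    (GLEANERIO_NABU_IMAGE, "/nabu/",
                lambda src: ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]),
    "release": (GLEANERIO_NABU_IMAGE, "/nabu/",
                lambda src: ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + src]),
}

def resolve(mode, source):
    image, workdir, argfn = MODES[mode]  # KeyError stands in for the returnCode = 1 branch
    return image, workdir, argfn(source), f"sch_{source}_{mode}"
```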
-
-@op
-def magic_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def magic_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "magic")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def magic_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "magic")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def magic_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "magic")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def magic_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "magic")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def magic_naburelease(context):
- returned_value = gleanerio(context,("release"), "magic")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def magic_uploadrelease(context):
- returned_value = post_to_graph("magic", extension="nq")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def magic_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="magic")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "magic"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-    r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def magic_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="magic")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "magic"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def magic_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="magic")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "magic"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def magic_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="magic")
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "magic"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def magic_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "magic"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-class S3ObjectInfo:
- bucket_name=""
- object_name=""
-
-@op(ins={"start": In(Nothing)})
-def magic_summarize(context) :
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "magic"
- endpoint = _graphEndpoint() # getting data, not uploading data
- summary_namespace = _graphSummaryEndpoint()
-
-
- try:
-
- summarydf = get_summary4repoSubset(endpoint, source_name)
- nt, g = summaryDF2ttl(summarydf, source_name) # let's try the new generator
- summaryttl = g.serialize(format='longturtle')
-        # Let's always write the file out to s3, and insert it as a separate process
-        # we might be able to make this an asset..., but it would need to be accessible by http
-        # if not stored in s3
-        objectname = f"{SUMMARY_PATH}/{source_name}_release.ttl" # needs to match what post_to_graph expects
- s3ObjectInfo= S3ObjectInfo()
- s3ObjectInfo.bucket_name=bucket
- s3ObjectInfo.object_name=objectname
-
- s3Minio.putTextFileToStore(summaryttl, s3ObjectInfo )
- #inserted = sumnsgraph.insert(bytes(summaryttl, 'utf-8'), content_type="application/x-turtle")
- #if not inserted:
- # raise Exception("Loading to graph failed.")
- except Exception as e:
- # use dagster logger
- get_dagster_logger().error(f"Summary. Issue creating graph {str(e)} ")
- raise Exception(f"Loading Summary graph failed. {str(e)}")
- return 1
-
- return
-
-@op(ins={"start": In(Nothing)})
-def magic_upload_summarize(context):
- returned_value = post_to_graph("magic",path=SUMMARY_PATH, extension="ttl", graphendpoint=_graphSummaryEndpoint())
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload summary returned {r} ")
- return
-
-# Can we simplify and use just a method, then import these methods?
-# def missingreport_s3(context, msg: str, source="magic"):
-#
-# source= getSitemapSourcesFromGleaner("/scheduler/gleanerconfig.yaml", sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="magic"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_magic():
- containers = magic_getImage()
- harvest = magic_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = magic_missingreport_s3(start=harvest)
- report_idstat = magic_identifier_stats(start=report_ms3)
-    # for some reason, this causes a missing msg parameter error
- report_bucketurl = magic_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="magic")
- load_release = magic_naburelease(start=harvest)
- load_uploadrelease = magic_uploadrelease(start=load_release)
-
- load_prune = magic_nabu_prune(start=load_uploadrelease)
- load_prov = magic_nabuprov(start=load_prune)
- load_org = magic_nabuorg(start=load_prov)
-
- summarize = magic_summarize(start=load_uploadrelease)
- upload_summarize = magic_upload_summarize(start=summarize)
-
-# run after load
- report_msgraph = magic_missingreport_graph(start=summarize)
- report_graph = magic_graph_reports(start=report_msgraph)
-
-
-
-
-
-
-
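The `harvest_magic` graph above chains ops purely through Dagster `Nothing` dependencies, i.e. ordering without data flow. A minimal self-contained sketch of that pattern:

```python
from dagster import In, Nothing, graph, op

@op
def first():
    ...  # e.g. pull images

@op(ins={"start": In(Nothing)})
def second():
    ...  # runs only after first completes

@graph
def pipeline():
    # second runs after first without receiving a value from it
    second(start=first())
```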
diff --git a/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_neon.py b/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_neon.py
deleted file mode 100644
index b8376c16..00000000
--- a/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_neon.py
+++ /dev/null
@@ -1,819 +0,0 @@
-import distutils
-import logging
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner, endpointUpdateNamespace
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-from ec.summarize import summaryDF2ttl, get_summary4graph, get_summary4repoSubset
-from ec.graph.manageGraph import ManageBlazegraph as mg
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# #
-# path to gleaner config in Dagster-daemon is "/scheduler/gleanerconfig.yaml" (config file mounted)
-# WHEN RUNNING dagster-dev, this needs to be a path to a local file
-##
-DAGSTER_GLEANER_CONFIG_PATH = os.environ.get('DAGSTER_GLEANER_CONFIG_PATH', "/scheduler/gleanerconfig.yaml")
-
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-
-# set for the earthcube utilities
-MINIO_OPTIONS={"secure":GLEANER_MINIO_USE_SSL
-
- ,"access_key": GLEANER_MINIO_ACCESS_KEY
- ,"secret_key": GLEANER_MINIO_SECRET_KEY
- }
-
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-#GLEANERIO_SUMMARY_GRAPH_ENDPOINT = os.environ.get('GLEANERIO_SUMMARY_GRAPH_ENDPOINT')
-GLEANERIO_SUMMARY_GRAPH_NAMESPACE = os.environ.get('GLEANERIO_SUMMARY_GRAPH_NAMESPACE',f"{GLEANER_GRAPH_NAMESPACE}_summary" )
-
-SUMMARY_PATH = 'graphs/summary'
-RELEASE_PATH = 'graphs/latest'
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-def _graphSummaryEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANERIO_SUMMARY_GRAPH_NAMESPACE}/sparql"
- return url
-def _pythonMinioAddress(url, port = None):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- if port is not None:
- PYTHON_MINIO_URL = f"{PYTHON_MINIO_URL}:{port}"
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioAddress(GLEANER_MINIO_ADDRESS,GLEANER_MINIO_PORT )
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
-
- server = _pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT)
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
-    # Make 'X' bucket if it does not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def post_to_graph(source, path=RELEASE_PATH, extension="nq", graphendpoint=_graphEndpoint()):
- # revision of EC utilities, will have a insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = _pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT)
- bucket = GLEANER_MINIO_BUCKET
- release_url = f"{proto}://{address}/{bucket}/{path}/{source}_release.{extension}"
- # BLAZEGRAPH SPECIFIC
- # url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- # get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- # r = requests.post(url)
- # log.debug(f' status:{r.status_code}') # status:404
- # get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- # if r.status_code == 200:
- # # ''
- # if 'data modified="0"' in r.text:
- # get_dagster_logger().info(f'graph: no data inserted ')
- # raise Exception("No Data Added: " + r.text)
- # return True
- # else:
- # get_dagster_logger().info(f'graph: error')
- # raise Exception(f' graph: insert failed: status:{r.status_code}')
-
- ### GENERIC LOAD FROM
- url = f"{graphendpoint}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- loadfrom = {'update': f'LOAD <{release_url}>'}
- headers = {
- 'Content-Type': 'application/x-www-form-urlencoded'
- }
- r = requests.post(url, headers=headers, data=loadfrom )
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: LOAD from {release_url}: status:{r.status_code}')
- if r.status_code == 200:
- get_dagster_logger().info(f'graph load response: {str(r.text)} ')
- # ''
- if 'mutationCount=0' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- #raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error {str(r.text)}')
- raise Exception(f' graph: failed, LOAD from {release_url}: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
-    ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
- # docker.py if replicated job, total completions = replicas
- # replicas =0 you do not get a container
-    service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
-        mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise f"Container for service {name} not starting"
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
-        # add env variables here, e.g. "Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) # the python needs to be wrapped, this does not
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
-        cid = container.id  # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-# this method of watching the logs,
- # do not let a possible issue with container logs stop log upload.
-        ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
-
- get_dagster_logger().info(f"This is ok. watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"This is ok. watch container logs failed other issue:{repr(ex)} ")
-
-
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + 'containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
-        # Future: need to extract files, and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def neon_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def neon_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "neon")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def neon_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "neon")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def neon_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "neon")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def neon_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "neon")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def neon_naburelease(context):
- returned_value = gleanerio(context,("release"), "neon")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def neon_uploadrelease(context):
- returned_value = post_to_graph("neon", extension="nq")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def neon_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="neon")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "neon"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-    r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def neon_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="neon")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "neon"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def neon_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="neon")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "neon"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def neon_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="neon")
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "neon"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def neon_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "neon"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-class S3ObjectInfo:
- bucket_name=""
- object_name=""
-
-@op(ins={"start": In(Nothing)})
-def neon_summarize(context) :
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "neon"
- endpoint = _graphEndpoint() # getting data, not uploading data
- summary_namespace = _graphSummaryEndpoint()
-
-
- try:
-
- summarydf = get_summary4repoSubset(endpoint, source_name)
- nt, g = summaryDF2ttl(summarydf, source_name) # let's try the new generator
- summaryttl = g.serialize(format='longturtle')
-        # Let's always write the file out to s3, and insert it as a separate process
-        # we might be able to make this an asset..., but it would need to be accessible by http
-        # if not stored in s3
-        objectname = f"{SUMMARY_PATH}/{source_name}_release.ttl" # needs to match what post_to_graph expects
- s3ObjectInfo= S3ObjectInfo()
- s3ObjectInfo.bucket_name=bucket
- s3ObjectInfo.object_name=objectname
-
- s3Minio.putTextFileToStore(summaryttl, s3ObjectInfo )
- #inserted = sumnsgraph.insert(bytes(summaryttl, 'utf-8'), content_type="application/x-turtle")
- #if not inserted:
- # raise Exception("Loading to graph failed.")
- except Exception as e:
- # use dagster logger
- get_dagster_logger().error(f"Summary. Issue creating graph {str(e)} ")
- raise Exception(f"Loading Summary graph failed. {str(e)}")
- return 1
-
- return
-
-@op(ins={"start": In(Nothing)})
-def neon_upload_summarize(context):
- returned_value = post_to_graph("neon",path=SUMMARY_PATH, extension="ttl", graphendpoint=_graphSummaryEndpoint())
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload summary returned {r} ")
- return
-
-# Can we simplify and use just a method, then import these methods?
-# def missingreport_s3(context, msg: str, source="neon"):
-#
-# source= getSitemapSourcesFromGleaner("/scheduler/gleanerconfig.yaml", sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="neon"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_neon():
- containers = neon_getImage()
- harvest = neon_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = neon_missingreport_s3(start=harvest)
- report_idstat = neon_identifier_stats(start=report_ms3)
- # for some reason, this causes a 'msg parameter missing' error
- report_bucketurl = neon_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="neon")
- load_release = neon_naburelease(start=harvest)
- load_uploadrelease = neon_uploadrelease(start=load_release)
-
- load_prune = neon_nabu_prune(start=load_uploadrelease)
- load_prov = neon_nabuprov(start=load_prune)
- load_org = neon_nabuorg(start=load_prov)
-
- summarize = neon_summarize(start=load_uploadrelease)
- upload_summarize = neon_upload_summarize(start=summarize)
-
-# run after load
- report_msgraph = neon_missingreport_graph(start=summarize)
- report_graph = neon_graph_reports(start=report_msgraph)
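-# the Nothing-dependency pattern used above, in miniature (op names illustrative):
-#   @op(ins={"start": In(Nothing)})
-#   def downstream(context): ...
-#   downstream(start=upstream())  # enforces ordering only; no value is passed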
-
-
-
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_neotomadb.py b/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_neotomadb.py
deleted file mode 100644
index cb268e7d..00000000
--- a/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_neotomadb.py
+++ /dev/null
@@ -1,819 +0,0 @@
-import distutils
-import logging
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner, endpointUpdateNamespace
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-from ec.summarize import summaryDF2ttl, get_summary4graph, get_summary4repoSubset
-from ec.graph.manageGraph import ManageBlazegraph as mg
-import requests
-import logging as log
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# #
-# path to gleaner config in Dagster-daemon is "/scheduler/gleanerconfig.yaml" (config file mounted)
-# WHEN RUNNING dagster-dev, this needs to be a path to a local file
-##
-DAGSTER_GLEANER_CONFIG_PATH = os.environ.get('DAGSTER_GLEANER_CONFIG_PATH', "/scheduler/gleanerconfig.yaml")
-
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-
-# set for the earthcube utilities
-MINIO_OPTIONS={"secure":GLEANER_MINIO_USE_SSL
-
- ,"access_key": GLEANER_MINIO_ACCESS_KEY
- ,"secret_key": GLEANER_MINIO_SECRET_KEY
- }
-
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-#GLEANERIO_SUMMARY_GRAPH_ENDPOINT = os.environ.get('GLEANERIO_SUMMARY_GRAPH_ENDPOINT')
-GLEANERIO_SUMMARY_GRAPH_NAMESPACE = os.environ.get('GLEANERIO_SUMMARY_GRAPH_NAMESPACE',f"{GLEANER_GRAPH_NAMESPACE}_summary" )
-
-SUMMARY_PATH = 'graphs/summary'
-RELEASE_PATH = 'graphs/latest'
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-def _graphSummaryEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANERIO_SUMMARY_GRAPH_NAMESPACE}/sparql"
- return url
-def _pythonMinioAddress(url, port = None):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- if port is not None:
- PYTHON_MINIO_URL = f"{PYTHON_MINIO_URL}:{port}"
- return PYTHON_MINIO_URL
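-# hedged examples of the mapping above (hostnames illustrative):
-#   _pythonMinioAddress("minio.example.org", 9000)  -> "minio.example.org:9000"
-#   _pythonMinioAddress("foo.s3.amazonaws.com")     -> "s3.amazonaws.com"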
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioAddress(GLEANER_MINIO_ADDRESS,GLEANER_MINIO_PORT )
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().error(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
-
- server = _pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT)
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def post_to_graph(source, path=RELEASE_PATH, extension="nq", graphendpoint=_graphEndpoint()):
- # revision of EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = _pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT)
- bucket = GLEANER_MINIO_BUCKET
- release_url = f"{proto}://{address}/{bucket}/{path}/{source}_release.{extension}"
- # BLAZEGRAPH SPECIFIC
- # url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- # get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- # r = requests.post(url)
- # log.debug(f' status:{r.status_code}') # status:404
- # get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- # if r.status_code == 200:
- # # ''
- # if 'data modified="0"' in r.text:
- # get_dagster_logger().info(f'graph: no data inserted ')
- # raise Exception("No Data Added: " + r.text)
- # return True
- # else:
- # get_dagster_logger().info(f'graph: error')
- # raise Exception(f' graph: insert failed: status:{r.status_code}')
-
- ### GENERIC LOAD FROM
- url = f"{graphendpoint}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- loadfrom = {'update': f'LOAD <{release_url}>'}
- headers = {
- 'Content-Type': 'application/x-www-form-urlencoded'
- }
- r = requests.post(url, headers=headers, data=loadfrom )
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: LOAD from {release_url}: status:{r.status_code}')
- if r.status_code == 200:
- get_dagster_logger().info(f'graph load response: {str(r.text)} ')
- # ''
- if 'mutationCount=0' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- #raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error {str(r.text)}')
- raise Exception(f' graph: failed, LOAD from {release_url}: status:{r.status_code}')
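-# the LOAD above is standard SPARQL 1.1 Update over HTTP; a hedged curl equivalent,
-# with endpoint and release URL illustrative:
-#   curl -X POST http://graph.example.org/namespace/eco/sparql \
-#        -H 'Content-Type: application/x-www-form-urlencoded' \
-#        --data-urlencode 'update=LOAD <https://minio.example.org/gleaner/graphs/latest/neotomadb_release.nq>'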
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
- ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
- # docker.py: for a replicated job, total completions = replicas
- # with replicas=0 you do not get a container
- service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
- mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise Exception(f"Container for service {name} not starting")
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
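-# hedged call sketch for the helper above, mirroring how gleanerio() uses it:
-#   service, container = _create_service(
-#       context, client, container_context, GLEANERIO_GLEANER_IMAGE, "",
-#       ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH, "-source", "neotomadb", "--rude"],
-#       name="sch_neotomadb_gleaner", workingdir="/gleaner/")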
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
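- # recap of the dispatch above (Cmd args only; the images supply the entrypoint):
- #   gleaner -> --cfg <gleaner cfg> -source <source> --rude
- #   prune   -> --cfg <nabu cfg> prune   --prefix summoned/<source>
- #   prov    -> --cfg <nabu cfg> prefix  --prefix prov/<source>
- #   orgs    -> --cfg <nabu cfg> prefix  --prefix orgs
- #   release -> --cfg <nabu cfg> release --prefix summoned/<source>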
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- service = None # bind the name so the finally block below works even if service creation fails
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) # the python needs to be wrapped, this does not
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
- cid = container.id # legacy till the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-# this method of watching the logs must not let a possible issue
- # with container logs stop log upload.
- ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
-
- get_dagster_logger().info(f"This is ok. watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"This is ok. watch container logs failed other issue:{repr(ex)} ")
-
-
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + 'containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
- # Future: need to extract files, and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def neotomadb_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def neotomadb_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "neotomadb")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def neotomadb_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "neotomadb")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def neotomadb_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "neotomadb")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def neotomadb_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "neotomadb")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def neotomadb_naburelease(context):
- returned_value = gleanerio(context,("release"), "neotomadb")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def neotomadb_uploadrelease(context):
- returned_value = post_to_graph("neotomadb", extension="nq")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def neotomadb_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="neotomadb")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "neotomadb"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def neotomadb_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="neotomadb")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "neotomadb"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def neotomadb_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="neotomadb")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "neotomadb"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def neotomadb_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="neotomadb")
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "neotomadb"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifier stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def neotomadb_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "neotomadb"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucket urls report returned {r} ")
- return
-
-class S3ObjectInfo:
- bucket_name=""
- object_name=""
-
-@op(ins={"start": In(Nothing)})
-def neotomadb_summarize(context) :
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "neotomadb"
- endpoint = _graphEndpoint() # getting data, not uploading data
- summary_namespace = _graphSummaryEndpoint()
-
-
- try:
-
- summarydf = get_summary4repoSubset(endpoint, source_name)
- nt, g = summaryDF2ttl(summarydf, source_name) # let's try the new generator
- summaryttl = g.serialize(format='longturtle')
- # Let's always write the file out to s3, and insert as a separate process
- # we might be able to make this an asset..., but it would need to be accessible by http
- # if not stored in s3
- objectname = f"{SUMMARY_PATH}/{source_name}_release.ttl" # needs to match what is expected by post
- s3ObjectInfo= S3ObjectInfo()
- s3ObjectInfo.bucket_name=bucket
- s3ObjectInfo.object_name=objectname
-
- s3Minio.putTextFileToStore(summaryttl, s3ObjectInfo )
- #inserted = sumnsgraph.insert(bytes(summaryttl, 'utf-8'), content_type="application/x-turtle")
- #if not inserted:
- # raise Exception("Loading to graph failed.")
- except Exception as e:
- # use dagster logger
- get_dagster_logger().error(f"Summary. Issue creating graph {str(e)} ")
- raise Exception(f"Loading Summary graph failed. {str(e)}")
- return 1
-
- return
-
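-# given SUMMARY_PATH above, the summary object lands at graphs/summary/neotomadb_release.ttl;
-# neotomadb_upload_summarize below then issues a SPARQL LOAD for that object.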
-@op(ins={"start": In(Nothing)})
-def neotomadb_upload_summarize(context):
- returned_value = post_to_graph("neotomadb",path=SUMMARY_PATH, extension="ttl", graphendpoint=_graphSummaryEndpoint())
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload summary returned {r} ")
- return
-
-# Can we simplify and use just a method, then import these methods?
-# def missingreport_s3(context, msg: str, source="neotomadb"):
-#
-# source= getSitemapSourcesFromGleaner("/scheduler/gleanerconfig.yaml", sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="neotomadb"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_neotomadb():
- containers = neotomadb_getImage()
- harvest = neotomadb_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = neotomadb_missingreport_s3(start=harvest)
- report_idstat = neotomadb_identifier_stats(start=report_ms3)
- # for some reason, this causes a 'msg parameter missing' error
- report_bucketurl = neotomadb_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="neotomadb")
- load_release = neotomadb_naburelease(start=harvest)
- load_uploadrelease = neotomadb_uploadrelease(start=load_release)
-
- load_prune = neotomadb_nabu_prune(start=load_uploadrelease)
- load_prov = neotomadb_nabuprov(start=load_prune)
- load_org = neotomadb_nabuorg(start=load_prov)
-
- summarize = neotomadb_summarize(start=load_uploadrelease)
- upload_summarize = neotomadb_upload_summarize(start=summarize)
-
-# run after load
- report_msgraph = neotomadb_missingreport_graph(start=summarize)
- report_graph = neotomadb_graph_reports(start=report_msgraph)
-
-
-
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_opencoredata.py b/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_opencoredata.py
deleted file mode 100644
index 1d84c757..00000000
--- a/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_opencoredata.py
+++ /dev/null
@@ -1,819 +0,0 @@
-import distutils
-import logging
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner, endpointUpdateNamespace
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-from ec.summarize import summaryDF2ttl, get_summary4graph, get_summary4repoSubset
-from ec.graph.manageGraph import ManageBlazegraph as mg
-import requests
-import logging as log
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# #
-# path to gleaner config in Dagster-daemon is "/scheduler/gleanerconfig.yaml" (config file mounted)
-# WHEN RUNNING dagster-dev, this needs to be a path to a local file
-##
-DAGSTER_GLEANER_CONFIG_PATH = os.environ.get('DAGSTER_GLEANER_CONFIG_PATH', "/scheduler/gleanerconfig.yaml")
-
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-
-# set for the earthcube utilities
-MINIO_OPTIONS={"secure":GLEANER_MINIO_USE_SSL
-
- ,"access_key": GLEANER_MINIO_ACCESS_KEY
- ,"secret_key": GLEANER_MINIO_SECRET_KEY
- }
-
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-#GLEANERIO_SUMMARY_GRAPH_ENDPOINT = os.environ.get('GLEANERIO_SUMMARY_GRAPH_ENDPOINT')
-GLEANERIO_SUMMARY_GRAPH_NAMESPACE = os.environ.get('GLEANERIO_SUMMARY_GRAPH_NAMESPACE',f"{GLEANER_GRAPH_NAMESPACE}_summary" )
-
-SUMMARY_PATH = 'graphs/summary'
-RELEASE_PATH = 'graphs/latest'
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-def _graphSummaryEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANERIO_SUMMARY_GRAPH_NAMESPACE}/sparql"
- return url
-def _pythonMinioAddress(url, port = None):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- if port is not None:
- PYTHON_MINIO_URL = f"{PYTHON_MINIO_URL}:{port}"
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
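-# load_data tries a URL first and falls back to a local path, e.g. (paths illustrative):
-#   load_data("https://example.org/gleanerconfig.yaml")  # fetched via urlopen
-#   load_data("/scheduler/gleanerconfig.yaml")           # urlopen raises ValueError -> local read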
-
-
-def s3reader(object):
- server = _pythonMinioAddress(GLEANER_MINIO_ADDRESS,GLEANER_MINIO_PORT )
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().error(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
-
- server = _pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT)
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
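-# example log object key produced above (timestamp illustrative):
-#   scheduler/logs/sch_opencoredata_gleaner_2024_01_01_00_00_00.log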
-def post_to_graph(source, path=RELEASE_PATH, extension="nq", graphendpoint=_graphEndpoint()):
- # revision of EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = _pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT)
- bucket = GLEANER_MINIO_BUCKET
- release_url = f"{proto}://{address}/{bucket}/{path}/{source}_release.{extension}"
- # BLAZEGRAPH SPECIFIC
- # url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- # get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- # r = requests.post(url)
- # log.debug(f' status:{r.status_code}') # status:404
- # get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- # if r.status_code == 200:
- # # ''
- # if 'data modified="0"' in r.text:
- # get_dagster_logger().info(f'graph: no data inserted ')
- # raise Exception("No Data Added: " + r.text)
- # return True
- # else:
- # get_dagster_logger().info(f'graph: error')
- # raise Exception(f' graph: insert failed: status:{r.status_code}')
-
- ### GENERIC LOAD FROM
- url = f"{graphendpoint}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- loadfrom = {'update': f'LOAD <{release_url}>'}
- headers = {
- 'Content-Type': 'application/x-www-form-urlencoded'
- }
- r = requests.post(url, headers=headers, data=loadfrom )
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: LOAD from {release_url}: status:{r.status_code}')
- if r.status_code == 200:
- get_dagster_logger().info(f'graph load response: {str(r.text)} ')
- # ''
- if 'mutationCount=0' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- #raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error {str(r.text)}')
- raise Exception(f' graph: failed, LOAD from {release_url}: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
- ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
- # docker.py: for a replicated job, total completions = replicas
- # with replicas=0 you do not get a container
- service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
- mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise Exception(f"Container for service {name} not starting")
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- service = None # bind the name so the finally block below works even if service creation fails
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) # the python needs to be wrapped, this does not
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
- cid = container.id # legacy till the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-# this method of watching the logs must not let a possible issue
- # with container logs stop log upload.
- ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
-
- get_dagster_logger().info(f"This is ok. watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"This is ok. watch container logs failed other issue:{repr(ex)} ")
-
-
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + 'containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
- # Future: need to extract files, and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def opencoredata_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def opencoredata_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "opencoredata")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def opencoredata_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "opencoredata")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def opencoredata_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "opencoredata")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def opencoredata_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "opencoredata")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def opencoredata_naburelease(context):
- returned_value = gleanerio(context,("release"), "opencoredata")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def opencoredata_uploadrelease(context):
- returned_value = post_to_graph("opencoredata", extension="nq")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def opencoredata_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="opencoredata")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "opencoredata"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-    r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def opencoredata_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="opencoredata")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "opencoredata"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def opencoredata_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="opencoredata")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "opencoredata"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def opencoredata_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="opencoredata")
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "opencoredata"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def opencoredata_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "opencoredata"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-class S3ObjectInfo:
- bucket_name=""
- object_name=""
-
-@op(ins={"start": In(Nothing)})
-def opencoredata_summarize(context) :
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "opencoredata"
- endpoint = _graphEndpoint() # getting data, not uploading data
- summary_namespace = _graphSummaryEndpoint()
-
-
- try:
-
- summarydf = get_summary4repoSubset(endpoint, source_name)
- nt, g = summaryDF2ttl(summarydf, source_name) # let's try the new generator
- summaryttl = g.serialize(format='longturtle')
-        # Let's always write the file out to s3, and insert as a separate process.
-        # We might be able to make this an asset..., but it would need to be accessible by http
-        # if not stored in s3
-        objectname = f"{SUMMARY_PATH}/{source_name}_release.ttl" # needs to match what post_to_graph expects
- s3ObjectInfo= S3ObjectInfo()
- s3ObjectInfo.bucket_name=bucket
- s3ObjectInfo.object_name=objectname
-
- s3Minio.putTextFileToStore(summaryttl, s3ObjectInfo )
- #inserted = sumnsgraph.insert(bytes(summaryttl, 'utf-8'), content_type="application/x-turtle")
- #if not inserted:
- # raise Exception("Loading to graph failed.")
- except Exception as e:
- # use dagster logger
- get_dagster_logger().error(f"Summary. Issue creating graph {str(e)} ")
- raise Exception(f"Loading Summary graph failed. {str(e)}")
- return 1
-
- return
-
-@op(ins={"start": In(Nothing)})
-def opencoredata_upload_summarize(context):
- returned_value = post_to_graph("opencoredata",path=SUMMARY_PATH, extension="ttl", graphendpoint=_graphSummaryEndpoint())
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload summary returned {r} ")
- return
-
-# Can we simplify this and use just a method, then import these methods? (see the sketch after this commented block)
-# def missingreport_s3(context, msg: str, source="opencoredata"):
-#
-# source= getSitemapSourcesFromGleaner("/scheduler/gleanerconfig.yaml", sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="opencoredata"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
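-# A possible simplification (sketch only, not wired in): build the per-mode ops with a
-# factory so each generated module shrinks to a handful of factory calls, e.g.:
-# def gleaner_op_factory(mode, source):
-#     @op(ins={"start": In(Nothing)}, name=f"{source}_{mode}")
-#     def _op(context):
-#         returned_value = gleanerio(context, mode, source)
-#         get_dagster_logger().info(f"{mode} returned {returned_value} ")
-#     return _op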
-@graph
-def harvest_opencoredata():
- containers = opencoredata_getImage()
- harvest = opencoredata_gleaner(start=containers)
-
-# defining nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = opencoredata_missingreport_s3(start=harvest)
- report_idstat = opencoredata_identifier_stats(start=report_ms3)
-    # for some reason, this causes a "msg parameter missing" error
- report_bucketurl = opencoredata_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="opencoredata")
- load_release = opencoredata_naburelease(start=harvest)
- load_uploadrelease = opencoredata_uploadrelease(start=load_release)
-
- load_prune = opencoredata_nabu_prune(start=load_uploadrelease)
- load_prov = opencoredata_nabuprov(start=load_prune)
- load_org = opencoredata_nabuorg(start=load_prov)
-
- summarize = opencoredata_summarize(start=load_uploadrelease)
- upload_summarize = opencoredata_upload_summarize(start=summarize)
-
-# run after load
- report_msgraph = opencoredata_missingreport_graph(start=summarize)
- report_graph = opencoredata_graph_reports(start=report_msgraph)
-
-
-
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_opentopography.py b/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_opentopography.py
deleted file mode 100644
index 4c82314d..00000000
--- a/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_opentopography.py
+++ /dev/null
@@ -1,819 +0,0 @@
-import distutils
-import logging
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner, endpointUpdateNamespace
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-from ec.summarize import summaryDF2ttl, get_summary4graph, get_summary4repoSubset
-from ec.graph.manageGraph import ManageBlazegraph as mg
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# #
-# path to gleaner config in Dagster-daemon is "/scheduler/gleanerconfig.yaml" (config file mounted)
-# WHEN RUNNING dagster-dev, this needs to be a path to a local file
-##
-DAGSTER_GLEANER_CONFIG_PATH = os.environ.get('DAGSTER_GLEANER_CONFIG_PATH', "/scheduler/gleanerconfig.yaml")
-
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-
-# set for the earthcube utilities
-MINIO_OPTIONS={"secure":GLEANER_MINIO_USE_SSL
-
- ,"access_key": GLEANER_MINIO_ACCESS_KEY
- ,"secret_key": GLEANER_MINIO_SECRET_KEY
- }
-
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-#GLEANERIO_SUMMARY_GRAPH_ENDPOINT = os.environ.get('GLEANERIO_SUMMARY_GRAPH_ENDPOINT')
-GLEANERIO_SUMMARY_GRAPH_NAMESPACE = os.environ.get('GLEANERIO_SUMMARY_GRAPH_NAMESPACE',f"{GLEANER_GRAPH_NAMESPACE}_summary" )
-
-SUMMARY_PATH = 'graphs/summary'
-RELEASE_PATH = 'graphs/latest'
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-def _graphSummaryEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANERIO_SUMMARY_GRAPH_NAMESPACE}/sparql"
- return url
-def _pythonMinioAddress(url, port = None):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- if port is not None:
- PYTHON_MINIO_URL = f"{PYTHON_MINIO_URL}:{port}"
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
-    with open(image_path, 'rb') as f:
-        data = f.read()
-    return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioAddress(GLEANER_MINIO_ADDRESS,GLEANER_MINIO_PORT )
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
-
- server = _pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT)
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
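-    # log objects land under GLEANERIO_LOG_PREFIX (default scheduler/logs/) as <name>_<timestamp>.log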
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def post_to_graph(source, path=RELEASE_PATH, extension="nq", graphendpoint=_graphEndpoint()):
- # revision of EC utilities, will have a insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = _pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT)
- bucket = GLEANER_MINIO_BUCKET
- release_url = f"{proto}://{address}/{bucket}/{path}/{source}_release.{extension}"
- # BLAZEGRAPH SPECIFIC
- # url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- # get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- # r = requests.post(url)
- # log.debug(f' status:{r.status_code}') # status:404
- # get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- # if r.status_code == 200:
- # # ''
- # if 'data modified="0"' in r.text:
- # get_dagster_logger().info(f'graph: no data inserted ')
- # raise Exception("No Data Added: " + r.text)
- # return True
- # else:
- # get_dagster_logger().info(f'graph: error')
- # raise Exception(f' graph: insert failed: status:{r.status_code}')
-
- ### GENERIC LOAD FROM
- url = f"{graphendpoint}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- loadfrom = {'update': f'LOAD <{release_url}>'}
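-    # SPARQL 1.1 UPDATE "LOAD <url>" makes the triplestore fetch the file itself,
-    # so the graph endpoint must be able to reach the minio/S3 release_url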
- headers = {
- 'Content-Type': 'application/x-www-form-urlencoded'
- }
- r = requests.post(url, headers=headers, data=loadfrom )
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: LOAD from {release_url}: status:{r.status_code}')
- if r.status_code == 200:
- get_dagster_logger().info(f'graph load response: {str(r.text)} ')
- # ''
- if 'mutationCount=0' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- #raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error {str(r.text)}')
- raise Exception(f' graph: failed, LOAD from {release_url}: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
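-    # the Portainer API key is injected into both the general configs and the session
-    # headers so every docker-py request to the remote endpoint is authenticated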
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
-    ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
- # docker.py if replicated job, total completions = replicas
- # replicas =0 you do not get a container
-    service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
-        mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
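-    # poll about once per second for the swarm task's container; give up after ~12 tries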
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise f"Container for service {name} not starting"
-
-    get_dagster_logger().info(f"containers: {len(containers)}")
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) # the python needs to be wrapped, this does not
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
-        cid = container.id # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-# this method of watching the logs
-        # must not let a possible issue with container logs stop the log upload.
-        ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
-        except docker.errors.APIError as ex:
-            get_dagster_logger().info(f"This is ok. Watching container logs failed with a Docker API issue: {repr(ex)}")
-        except Exception as ex:
-            get_dagster_logger().info(f"This is ok. Watching container logs failed with another issue: {repr(ex)} ")
-
-
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + 'containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
-        # Future: need to extract files and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def opentopography_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def opentopography_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "opentopography")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def opentopography_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "opentopography")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def opentopography_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "opentopography")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def opentopography_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "opentopography")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def opentopography_naburelease(context):
- returned_value = gleanerio(context,("release"), "opentopography")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def opentopography_uploadrelease(context):
- returned_value = post_to_graph("opentopography", extension="nq")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def opentopography_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="opentopography")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "opentopography"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-    r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def opentopography_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="opentopography")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "opentopography"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def opentopography_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="opentopography")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "opentopography"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def opentopography_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="opentopography")
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "opentopography"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def opentopography_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "opentopography"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-class S3ObjectInfo:
- bucket_name=""
- object_name=""
-
-@op(ins={"start": In(Nothing)})
-def opentopography_summarize(context) :
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "opentopography"
- endpoint = _graphEndpoint() # getting data, not uploading data
- summary_namespace = _graphSummaryEndpoint()
-
-
- try:
-
- summarydf = get_summary4repoSubset(endpoint, source_name)
- nt, g = summaryDF2ttl(summarydf, source_name) # let's try the new generator
- summaryttl = g.serialize(format='longturtle')
-        # Let's always write the file out to s3, and insert as a separate process.
-        # We might be able to make this an asset..., but it would need to be accessible by http
-        # if not stored in s3
-        objectname = f"{SUMMARY_PATH}/{source_name}_release.ttl" # needs to match what post_to_graph expects
- s3ObjectInfo= S3ObjectInfo()
- s3ObjectInfo.bucket_name=bucket
- s3ObjectInfo.object_name=objectname
-
- s3Minio.putTextFileToStore(summaryttl, s3ObjectInfo )
- #inserted = sumnsgraph.insert(bytes(summaryttl, 'utf-8'), content_type="application/x-turtle")
- #if not inserted:
- # raise Exception("Loading to graph failed.")
- except Exception as e:
- # use dagster logger
- get_dagster_logger().error(f"Summary. Issue creating graph {str(e)} ")
- raise Exception(f"Loading Summary graph failed. {str(e)}")
- return 1
-
- return
-
-@op(ins={"start": In(Nothing)})
-def opentopography_upload_summarize(context):
- returned_value = post_to_graph("opentopography",path=SUMMARY_PATH, extension="ttl", graphendpoint=_graphSummaryEndpoint())
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload summary returned {r} ")
- return
-
-# Can we simplify this and use just a method, then import these methods?
-# def missingreport_s3(context, msg: str, source="opentopography"):
-#
-# source= getSitemapSourcesFromGleaner("/scheduler/gleanerconfig.yaml", sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="opentopography"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_opentopography():
- containers = opentopography_getImage()
- harvest = opentopography_gleaner(start=containers)
-
-# defining nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = opentopography_missingreport_s3(start=harvest)
- report_idstat = opentopography_identifier_stats(start=report_ms3)
-    # for some reason, this causes a "msg parameter missing" error
- report_bucketurl = opentopography_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="opentopography")
- load_release = opentopography_naburelease(start=harvest)
- load_uploadrelease = opentopography_uploadrelease(start=load_release)
-
- load_prune = opentopography_nabu_prune(start=load_uploadrelease)
- load_prov = opentopography_nabuprov(start=load_prune)
- load_org = opentopography_nabuorg(start=load_prov)
-
- summarize = opentopography_summarize(start=load_uploadrelease)
- upload_summarize = opentopography_upload_summarize(start=summarize)
-
-# run after load
- report_msgraph = opentopography_missingreport_graph(start=summarize)
- report_graph = opentopography_graph_reports(start=report_msgraph)
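-# the graph fans out from the harvest: the s3 reports and the release/summarize
-# chains run in parallel, ordered only by the Nothing dependencies above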
-
-
-
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_resource_registry.py b/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_resource_registry.py
deleted file mode 100644
index 239da8c1..00000000
--- a/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_resource_registry.py
+++ /dev/null
@@ -1,819 +0,0 @@
-import distutils
-import logging
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner, endpointUpdateNamespace
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-from ec.summarize import summaryDF2ttl, get_summary4graph, get_summary4repoSubset
-from ec.graph.manageGraph import ManageBlazegraph as mg
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# #
-# path to gleaner config in Dagster-daemon is "/scheduler/gleanerconfig.yaml" (config file mounted)
-# WHEN RUNNING dagster-dev, this needs to be a path to a local file
-##
-DAGSTER_GLEANER_CONFIG_PATH = os.environ.get('DAGSTER_GLEANER_CONFIG_PATH', "/scheduler/gleanerconfig.yaml")
-
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-
-# set for the earthcube utilities
-MINIO_OPTIONS={"secure":GLEANER_MINIO_USE_SSL
-
- ,"access_key": GLEANER_MINIO_ACCESS_KEY
- ,"secret_key": GLEANER_MINIO_SECRET_KEY
- }
-
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-#GLEANERIO_SUMMARY_GRAPH_ENDPOINT = os.environ.get('GLEANERIO_SUMMARY_GRAPH_ENDPOINT')
-GLEANERIO_SUMMARY_GRAPH_NAMESPACE = os.environ.get('GLEANERIO_SUMMARY_GRAPH_NAMESPACE',f"{GLEANER_GRAPH_NAMESPACE}_summary" )
-
-SUMMARY_PATH = 'graphs/summary'
-RELEASE_PATH = 'graphs/latest'
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-def _graphSummaryEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANERIO_SUMMARY_GRAPH_NAMESPACE}/sparql"
- return url
-def _pythonMinioAddress(url, port = None):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- if port is not None:
- PYTHON_MINIO_URL = f"{PYTHON_MINIO_URL}:{port}"
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
-    with open(image_path, 'rb') as f:
-        data = f.read()
-    return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioAddress(GLEANER_MINIO_ADDRESS,GLEANER_MINIO_PORT )
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
-
- server = _pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT)
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def post_to_graph(source, path=RELEASE_PATH, extension="nq", graphendpoint=_graphEndpoint()):
- # revision of EC utilities, will have a insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = _pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT)
- bucket = GLEANER_MINIO_BUCKET
- release_url = f"{proto}://{address}/{bucket}/{path}/{source}_release.{extension}"
- # BLAZEGRAPH SPECIFIC
- # url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- # get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- # r = requests.post(url)
- # log.debug(f' status:{r.status_code}') # status:404
- # get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- # if r.status_code == 200:
- # # ''
- # if 'data modified="0"' in r.text:
- # get_dagster_logger().info(f'graph: no data inserted ')
- # raise Exception("No Data Added: " + r.text)
- # return True
- # else:
- # get_dagster_logger().info(f'graph: error')
- # raise Exception(f' graph: insert failed: status:{r.status_code}')
-
- ### GENERIC LOAD FROM
- url = f"{graphendpoint}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- loadfrom = {'update': f'LOAD <{release_url}>'}
- headers = {
- 'Content-Type': 'application/x-www-form-urlencoded'
- }
- r = requests.post(url, headers=headers, data=loadfrom )
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: LOAD from {release_url}: status:{r.status_code}')
- if r.status_code == 200:
- get_dagster_logger().info(f'graph load response: {str(r.text)} ')
- # ''
- if 'mutationCount=0' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- #raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error {str(r.text)}')
- raise Exception(f' graph: failed, LOAD from {release_url}: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
-    ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
- # docker.py if replicated job, total completions = replicas
- # replicas =0 you do not get a container
-    service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
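-    # each ConfigReference mounts a swarm config inside the service container at its
-    # config path (GLEANERIO_GLEANER_CONFIG_PATH / GLEANERIO_NABU_CONFIG_PATH)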
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
-        mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise f"Container for service {name} not starting"
-
-    get_dagster_logger().info(f"containers: {len(containers)}")
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add env variables here, e.g. "Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) # the python needs to be wrapped, this does not
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
- cid = container.id # legacy until the service start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-# this method of watching the logs:
- # do not let a possible issue with container logs stop log upload.
- ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
-
- get_dagster_logger().info(f"This is ok. watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"This is ok. watch container logs failed other issue:{repr(ex)} ")
-
-
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + 'containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
- # Future: need to extract files, and upload (see the sketch after this function)
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
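-# Sketch for the "extract files, and upload" TODO in gleanerio above: unpack the
-# /logs archive pulled from the container and push each member to s3.
-# (helper name is an assumption; nothing calls this yet)
-def _upload_archive_members(archive_bytes, source):
- import tarfile  # not imported at module level in this generated file
- with tarfile.open(fileobj=io.BytesIO(archive_bytes)) as tar:
- for i, member in enumerate(tar.getmembers()):
- fh = tar.extractfile(member)
- if fh is not None:
- s3loader(fh.read(), f"{source}_{i}_runlogs")
-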
-@op
-def resource_registry_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def resource_registry_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "resource_registry")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def resource_registry_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "resource_registry")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def resource_registry_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "resource_registry")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def resource_registry_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "resource_registry")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def resource_registry_naburelease(context):
- returned_value = gleanerio(context,("release"), "resource_registry")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def resource_registry_uploadrelease(context):
- returned_value = post_to_graph("resource_registry", extension="nq")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def resource_registry_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="resource_registry")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "resource_registry"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def resource_registry_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="resource_registry")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "resource_registry"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def resource_registry_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="resource_registry")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "resource_registry"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def resource_registry_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="resource_registry")
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "resource_registry"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def resource_registry_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "resource_registry"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-class S3ObjectInfo:
- bucket_name=""
- object_name=""
-
-@op(ins={"start": In(Nothing)})
-def resource_registry_summarize(context) :
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "resource_registry"
- endpoint = _graphEndpoint() # getting data, not uploading data
- summary_namespace = _graphSummaryEndpoint()
-
-
- try:
-
- summarydf = get_summary4repoSubset(endpoint, source_name)
- nt, g = summaryDF2ttl(summarydf, source_name) # let's try the new generator
- summaryttl = g.serialize(format='longturtle')
- # Let's always write the file out to s3, and insert as a separate process
- # we might be able to make this an asset..., but it would need to be accessible by http
- # if not stored in s3
- objectname = f"{SUMMARY_PATH}/{source_name}_release.ttl" # needs to match what is expected by post
- s3ObjectInfo= S3ObjectInfo()
- s3ObjectInfo.bucket_name=bucket
- s3ObjectInfo.object_name=objectname
-
- s3Minio.putTextFileToStore(summaryttl, s3ObjectInfo )
- #inserted = sumnsgraph.insert(bytes(summaryttl, 'utf-8'), content_type="application/x-turtle")
- #if not inserted:
- # raise Exception("Loading to graph failed.")
- except Exception as e:
- # use dagster logger
- get_dagster_logger().error(f"Summary. Issue creating graph {str(e)} ")
- raise Exception(f"Loading Summary graph failed. {str(e)}")
- return 1
-
- return
-
-@op(ins={"start": In(Nothing)})
-def resource_registry_upload_summarize(context):
- returned_value = post_to_graph("resource_registry",path=SUMMARY_PATH, extension="ttl", graphendpoint=_graphSummaryEndpoint())
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload summary returned {r} ")
- return
-
-# Can we simplify and use just a method, then import these methods?
-# def missingreport_s3(context, msg: str, source="resource_registry"):
-#
-# source= getSitemapSourcesFromGleaner("/scheduler/gleanerconfig.yaml", sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="resource_registry"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_resource_registry():
- containers = resource_registry_getImage()
- harvest = resource_registry_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = resource_registry_missingreport_s3(start=harvest)
- report_idstat = resource_registry_identifier_stats(start=report_ms3)
- # for some reason, this causes a "msg parameter missing" error
- report_bucketurl = resource_registry_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="resource_registry")
- load_release = resource_registry_naburelease(start=harvest)
- load_uploadrelease = resource_registry_uploadrelease(start=load_release)
-
- load_prune = resource_registry_nabu_prune(start=load_uploadrelease)
- load_prov = resource_registry_nabuprov(start=load_prune)
- load_org = resource_registry_nabuorg(start=load_prov)
-
- summarize = resource_registry_summarize(start=load_uploadrelease)
- upload_summarize = resource_registry_upload_summarize(start=summarize)
-
-# run after load
- report_msgraph = resource_registry_missingreport_graph(start=summarize)
- report_graph = resource_registry_graph_reports(start=report_msgraph)
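-
-# a minimal sketch of wiring the graph above to a runnable Dagster job
-# (the job name is an assumption; the generated repo normally defines jobs elsewhere)
-harvest_resource_registry_job = harvest_resource_registry.to_job(name="harvest_resource_registry_job")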
-
-
-
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_ssdbiodp.py b/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_ssdbiodp.py
deleted file mode 100644
index 1e89ce42..00000000
--- a/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_ssdbiodp.py
+++ /dev/null
@@ -1,819 +0,0 @@
-import distutils
-import logging
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner, endpointUpdateNamespace
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-from ec.summarize import summaryDF2ttl, get_summary4graph, get_summary4repoSubset
-from ec.graph.manageGraph import ManageBlazegraph as mg
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# #
-# path to gleaner config in Dagster-daemon is "/scheduler/gleanerconfig.yaml" (config file mounted)
-# WHEN RUNNING dagster-dev, this needs to be a path to a local file
-##
-DAGSTER_GLEANER_CONFIG_PATH = os.environ.get('DAGSTER_GLEANER_CONFIG_PATH', "/scheduler/gleanerconfig.yaml")
-
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET = str(os.environ.get('GLEANERIO_MINIO_BUCKET'))
-
-# set for the earthcube utilities
-MINIO_OPTIONS={"secure":GLEANER_MINIO_USE_SSL
-
- ,"access_key": GLEANER_MINIO_ACCESS_KEY
- ,"secret_key": GLEANER_MINIO_SECRET_KEY
- }
-
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE = str(os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-#GLEANERIO_SUMMARY_GRAPH_ENDPOINT = os.environ.get('GLEANERIO_SUMMARY_GRAPH_ENDPOINT')
-GLEANERIO_SUMMARY_GRAPH_NAMESPACE = os.environ.get('GLEANERIO_SUMMARY_GRAPH_NAMESPACE',f"{GLEANER_GRAPH_NAMESPACE}_summary" )
-
-SUMMARY_PATH = 'graphs/summary'
-RELEASE_PATH = 'graphs/latest'
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-def _graphSummaryEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANERIO_SUMMARY_GRAPH_NAMESPACE}/sparql"
- return url
-def _pythonMinioAddress(url, port = None):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- if port is not None:
- PYTHON_MINIO_URL = f"{PYTHON_MINIO_URL}:{port}"
- return PYTHON_MINIO_URL
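-# quick, self-contained sanity checks for the address normalization above
-# (hosts are illustrative; only evaluated when this module is run directly):
-if __name__ == "__main__":
- assert _pythonMinioAddress("minio.example.org", 9000) == "minio.example.org:9000"
- assert _pythonMinioAddress("play.s3.amazonaws.com") == "s3.amazonaws.com"
-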
-def read_file_bytestream(image_path):
- # open via a context manager so the handle is closed promptly
- with open(image_path, 'rb') as f:
- data = f.read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioAddress(GLEANER_MINIO_ADDRESS,GLEANER_MINIO_PORT )
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
-
- server = _pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT)
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def post_to_graph(source, path=RELEASE_PATH, extension="nq", graphendpoint=_graphEndpoint()):
- # revision of EC utilities, will have a insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = _pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT)
- bucket = GLEANER_MINIO_BUCKET
- release_url = f"{proto}://{address}/{bucket}/{path}/{source}_release.{extension}"
- # BLAZEGRAPH SPECIFIC
- # url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- # get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- # r = requests.post(url)
- # log.debug(f' status:{r.status_code}') # status:404
- # get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- # if r.status_code == 200:
- # # ''
- # if 'data modified="0"' in r.text:
- # get_dagster_logger().info(f'graph: no data inserted ')
- # raise Exception("No Data Added: " + r.text)
- # return True
- # else:
- # get_dagster_logger().info(f'graph: error')
- # raise Exception(f' graph: insert failed: status:{r.status_code}')
-
- ### GENERIC LOAD FROM
- url = f"{graphendpoint}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- loadfrom = {'update': f'LOAD <{release_url}>'}
- headers = {
- 'Content-Type': 'application/x-www-form-urlencoded'
- }
- r = requests.post(url, headers=headers, data=loadfrom )
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: LOAD from {release_url}: status:{r.status_code}')
- if r.status_code == 200:
- get_dagster_logger().info(f'graph load response: {str(r.text)} ')
- # ''
- if 'mutationCount=0' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- #raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error {str(r.text)}')
- raise Exception(f' graph: failed, LOAD from {release_url}: status:{r.status_code}')
-
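-# The SPARQL 1.1 UPDATE pattern post_to_graph issues above, as a standalone sketch
-# (function name is an assumption; endpoint and release URL come from the caller):
-def _sparql_load(endpoint, release_url):
- r = requests.post(
- endpoint,
- headers={'Content-Type': 'application/x-www-form-urlencoded'},
- data={'update': f'LOAD <{release_url}>'},
- )
- r.raise_for_status()
- return r.text
-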
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
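-# The header-injection trick in _get_client, reduced to its essentials
-# (assumes PORTAINER_URL/PORTAINER_KEY are set; only runs when executed directly):
-if __name__ == "__main__":
- c = docker.DockerClient(base_url=URL, version="1.43")
- c.api.headers['X-API-Key'] = APIKEY  # every request now carries the Portainer key
- print(c.version())
-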
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
- ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
- # docker-py: for a replicated-job, total completions = replicas
- # with replicas=0 you do not get a container
- service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
- mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise f"Container for service {name} not starting"
-
- get_dagster_logger().info(f"found {len(containers)} containers")
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add env variables here, e.g. "Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) # the python needs to be wrapped, this does not
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
- cid = container.id # legacy until the service start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-# this method of watching the logs:
- # do not let a possible issue with container logs stop log upload.
- ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
-
- get_dagster_logger().info(f"This is ok. watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"This is ok. watch container logs failed other issue:{repr(ex)} ")
-
-
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + 'containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
- # Future: need to extract files, and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def ssdbiodp_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def ssdbiodp_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "ssdbiodp")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def ssdbiodp_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "ssdbiodp")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def ssdbiodp_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "ssdbiodp")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def ssdbiodp_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "ssdbiodp")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def ssdbiodp_naburelease(context):
- returned_value = gleanerio(context,("release"), "ssdbiodp")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def ssdbiodp_uploadrelease(context):
- returned_value = post_to_graph("ssdbiodp", extension="nq")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def ssdbiodp_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="ssdbiodp")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "ssdbiodp"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def ssdbiodp_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="ssdbiodp")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "ssdbiodp"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def ssdbiodp_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="ssdbiodp")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "ssdbiodp"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def ssdbiodp_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="ssdbiodp")
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "ssdbiodp"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def ssdbiodp_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "ssdbiodp"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-class S3ObjectInfo:
- bucket_name=""
- object_name=""
-
-@op(ins={"start": In(Nothing)})
-def ssdbiodp_summarize(context) :
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "ssdbiodp"
- endpoint = _graphEndpoint() # getting data, not uploading data
- summary_namespace = _graphSummaryEndpoint()
-
-
- try:
-
- summarydf = get_summary4repoSubset(endpoint, source_name)
- nt, g = summaryDF2ttl(summarydf, source_name) # let's try the new generator
- summaryttl = g.serialize(format='longturtle')
- # Let's always write the file out to s3, and insert as a separate process
- # we might be able to make this an asset..., but it would need to be accessible by http
- # if not stored in s3
- objectname = f"{SUMMARY_PATH}/{source_name}_release.ttl" # needs to match what is expected by post
- s3ObjectInfo= S3ObjectInfo()
- s3ObjectInfo.bucket_name=bucket
- s3ObjectInfo.object_name=objectname
-
- s3Minio.putTextFileToStore(summaryttl, s3ObjectInfo )
- #inserted = sumnsgraph.insert(bytes(summaryttl, 'utf-8'), content_type="application/x-turtle")
- #if not inserted:
- # raise Exception("Loading to graph failed.")
- except Exception as e:
- # use dagster logger
- get_dagster_logger().error(f"Summary. Issue creating graph {str(e)} ")
- raise Exception(f"Loading Summary graph failed. {str(e)}")
- return 1
-
- return
-
-@op(ins={"start": In(Nothing)})
-def ssdbiodp_upload_summarize(context):
- returned_value = post_to_graph("ssdbiodp",path=SUMMARY_PATH, extension="ttl", graphendpoint=_graphSummaryEndpoint())
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload summary returned {r} ")
- return
-
-# Can we simplify and use just a method, then import these methods?
-# def missingreport_s3(context, msg: str, source="ssdbiodp"):
-#
-# source= getSitemapSourcesFromGleaner("/scheduler/gleanerconfig.yaml", sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="ssdbiodp"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_ssdbiodp():
- containers = ssdbiodp_getImage()
- harvest = ssdbiodp_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = ssdbiodp_missingreport_s3(start=harvest)
- report_idstat = ssdbiodp_identifier_stats(start=report_ms3)
- # for some reason, this causes a "msg parameter missing" error
- report_bucketurl = ssdbiodp_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="ssdbiodp")
- load_release = ssdbiodp_naburelease(start=harvest)
- load_uploadrelease = ssdbiodp_uploadrelease(start=load_release)
-
- load_prune = ssdbiodp_nabu_prune(start=load_uploadrelease)
- load_prov = ssdbiodp_nabuprov(start=load_prune)
- load_org = ssdbiodp_nabuorg(start=load_prov)
-
- summarize = ssdbiodp_summarize(start=load_uploadrelease)
- upload_summarize = ssdbiodp_upload_summarize(start=summarize)
-
-# run after load
- report_msgraph = ssdbiodp_missingreport_graph(start=summarize)
- report_graph = ssdbiodp_graph_reports(start=report_msgraph)
-
-
-
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_ucar.py b/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_ucar.py
deleted file mode 100644
index 444799dd..00000000
--- a/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_ucar.py
+++ /dev/null
@@ -1,819 +0,0 @@
-import distutils
-import logging
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner, endpointUpdateNamespace
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-from ec.summarize import summaryDF2ttl, get_summary4graph, get_summary4repoSubset
-from ec.graph.manageGraph import ManageBlazegraph as mg
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# #
-# path to gleaner config in Dagster-daemon is "/scheduler/gleanerconfig.yaml" (config file mounted)
-# WHEN RUNNING dagster-dev, this needs to be a path to a local file
-##
-DAGSTER_GLEANER_CONFIG_PATH = os.environ.get('DAGSTER_GLEANER_CONFIG_PATH', "/scheduler/gleanerconfig.yaml")
-
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET = str(os.environ.get('GLEANERIO_MINIO_BUCKET'))
-
-# set for the earthcube utilities
-MINIO_OPTIONS={"secure":GLEANER_MINIO_USE_SSL
-
- ,"access_key": GLEANER_MINIO_ACCESS_KEY
- ,"secret_key": GLEANER_MINIO_SECRET_KEY
- }
-
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE = str(os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-#GLEANERIO_SUMMARY_GRAPH_ENDPOINT = os.environ.get('GLEANERIO_SUMMARY_GRAPH_ENDPOINT')
-GLEANERIO_SUMMARY_GRAPH_NAMESPACE = os.environ.get('GLEANERIO_SUMMARY_GRAPH_NAMESPACE',f"{GLEANER_GRAPH_NAMESPACE}_summary" )
-
-SUMMARY_PATH = 'graphs/summary'
-RELEASE_PATH = 'graphs/latest'
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-def _graphSummaryEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANERIO_SUMMARY_GRAPH_NAMESPACE}/sparql"
- return url
-def _pythonMinioAddress(url, port = None):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- if port is not None:
- PYTHON_MINIO_URL = f"{PYTHON_MINIO_URL}:{port}"
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- # open via a context manager so the handle is closed promptly
- with open(image_path, 'rb') as f:
- data = f.read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioAddress(GLEANER_MINIO_ADDRESS,GLEANER_MINIO_PORT )
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
-
- server = _pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT)
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def post_to_graph(source, path=RELEASE_PATH, extension="nq", graphendpoint=_graphEndpoint()):
-    # revision of EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = _pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT)
- bucket = GLEANER_MINIO_BUCKET
- release_url = f"{proto}://{address}/{bucket}/{path}/{source}_release.{extension}"
- # BLAZEGRAPH SPECIFIC
- # url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- # get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- # r = requests.post(url)
- # log.debug(f' status:{r.status_code}') # status:404
- # get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- # if r.status_code == 200:
- # # ''
- # if 'data modified="0"' in r.text:
- # get_dagster_logger().info(f'graph: no data inserted ')
- # raise Exception("No Data Added: " + r.text)
- # return True
- # else:
- # get_dagster_logger().info(f'graph: error')
- # raise Exception(f' graph: insert failed: status:{r.status_code}')
-
- ### GENERIC LOAD FROM
- url = f"{graphendpoint}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- loadfrom = {'update': f'LOAD <{release_url}>'}
- headers = {
- 'Content-Type': 'application/x-www-form-urlencoded'
- }
- r = requests.post(url, headers=headers, data=loadfrom )
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: LOAD from {release_url}: status:{r.status_code}')
- if r.status_code == 200:
- get_dagster_logger().info(f'graph load response: {str(r.text)} ')
- # ''
- if 'mutationCount=0' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- #raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error {str(r.text)}')
- raise Exception(f' graph: failed, LOAD from {release_url}: status:{r.status_code}')
-
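-# For reference, the SPARQL UPDATE issued above is equivalent to (illustrative
-# endpoint and bucket values):
-#   curl -X POST -H 'Content-Type: application/x-www-form-urlencoded' \
-#     --data-urlencode 'update=LOAD <https://minio.example.org/gleaner/graphs/latest/ucar_release.nq>' \
-#     http://graph:9999/blazegraph/namespace/eco/sparql
-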
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
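-# Note: the X-API-Key header injected above is for a Docker API reached through
-# a Portainer endpoint (URL and APIKEY come from PORTAINER_URL / PORTAINER_KEY).
-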
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
-    ## thoughts
-    # return service, container, since there is one
-    restart_policy = RestartPolicy(condition='none')
-    # docker.py: for a replicated-job, total completions = replicas;
-    # with replicas=0 you do not get a container
-    service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
-        mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise f"Container for service {name} not starting"
-
-    get_dagster_logger().info(f"containers found: {len(containers)}")
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) # the python needs to be wrapped, this does not
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
-        cid = container.id # legacy until the service start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-# this method of watching the logs:
-        # do not let a possible issue with container logs stop log upload.
-        ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
-
- get_dagster_logger().info(f"This is ok. watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"This is ok. watch container logs failed other issue:{repr(ex)} ")
-
-
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + 'containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
-        # Future: need to extract files and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
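-# Usage sketch (values illustrative): each op below drives one container run,
-# e.g. gleanerio(context, "gleaner", "ucar") launches the sch_ucar_gleaner job
-# and raises on a non-zero container exit code.
-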
-@op
-def ucar_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def ucar_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "ucar")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def ucar_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "ucar")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def ucar_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "ucar")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def ucar_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "ucar")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def ucar_naburelease(context):
- returned_value = gleanerio(context,("release"), "ucar")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def ucar_uploadrelease(context):
- returned_value = post_to_graph("ucar", extension="nq")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def ucar_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="ucar")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "ucar"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-    r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def ucar_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="ucar")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "ucar"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def ucar_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="ucar")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "ucar"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def ucar_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="ucar")
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "ucar"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def ucar_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "ucar"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-class S3ObjectInfo:
- bucket_name=""
- object_name=""
-
-@op(ins={"start": In(Nothing)})
-def ucar_summarize(context) :
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "ucar"
- endpoint = _graphEndpoint() # getting data, not uploading data
- summary_namespace = _graphSummaryEndpoint()
-
-
- try:
-
- summarydf = get_summary4repoSubset(endpoint, source_name)
- nt, g = summaryDF2ttl(summarydf, source_name) # let's try the new generator
- summaryttl = g.serialize(format='longturtle')
-        # Let's always write the file out to s3, and insert as a separate process.
-        # we might be able to make this an asset..., but it would need to be accessible by http
-        # if not stored in s3
-        objectname = f"{SUMMARY_PATH}/{source_name}_release.ttl" # needs to match what post_to_graph expects
- s3ObjectInfo= S3ObjectInfo()
- s3ObjectInfo.bucket_name=bucket
- s3ObjectInfo.object_name=objectname
-
- s3Minio.putTextFileToStore(summaryttl, s3ObjectInfo )
- #inserted = sumnsgraph.insert(bytes(summaryttl, 'utf-8'), content_type="application/x-turtle")
- #if not inserted:
- # raise Exception("Loading to graph failed.")
- except Exception as e:
- # use dagster logger
- get_dagster_logger().error(f"Summary. Issue creating graph {str(e)} ")
- raise Exception(f"Loading Summary graph failed. {str(e)}")
- return 1
-
- return
-
-@op(ins={"start": In(Nothing)})
-def ucar_upload_summarize(context):
- returned_value = post_to_graph("ucar",path=SUMMARY_PATH, extension="ttl", graphendpoint=_graphSummaryEndpoint())
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload summary returned {r} ")
- return
-
-# Can we simplify and just use a method, then import these methods?
-# def missingreport_s3(context, msg: str, source="ucar"):
-#
-# source= getSitemapSourcesFromGleaner("/scheduler/gleanerconfig.yaml", sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="ucar"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_ucar():
- containers = ucar_getImage()
- harvest = ucar_gleaner(start=containers)
-
-# defining nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = ucar_missingreport_s3(start=harvest)
- report_idstat = ucar_identifier_stats(start=report_ms3)
-    # for some reason, this causes a "msg parameter missing" error
- report_bucketurl = ucar_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="ucar")
- load_release = ucar_naburelease(start=harvest)
- load_uploadrelease = ucar_uploadrelease(start=load_release)
-
- load_prune = ucar_nabu_prune(start=load_uploadrelease)
- load_prov = ucar_nabuprov(start=load_prune)
- load_org = ucar_nabuorg(start=load_prov)
-
- summarize = ucar_summarize(start=load_uploadrelease)
- upload_summarize = ucar_upload_summarize(start=summarize)
-
-# run after load
- report_msgraph = ucar_missingreport_graph(start=summarize)
- report_graph = ucar_graph_reports(start=report_msgraph)
-
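-# A minimal sketch (names assumed, not part of the generated output): the graph
-# can be turned into a runnable job and put on a cron schedule elsewhere, e.g.
-#
-#   from dagster import ScheduleDefinition
-#   harvest_ucar_job = harvest_ucar.to_job()
-#   harvest_ucar_schedule = ScheduleDefinition(job=harvest_ucar_job, cron_schedule="0 3 * * 0")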
-
-
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_unavco.py b/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_unavco.py
deleted file mode 100644
index df1085ae..00000000
--- a/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_unavco.py
+++ /dev/null
@@ -1,819 +0,0 @@
-import distutils.util
-import logging
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner, endpointUpdateNamespace
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-from ec.summarize import summaryDF2ttl, get_summary4graph, get_summary4repoSubset
-from ec.graph.manageGraph import ManageBlazegraph as mg
-import requests
-import logging as log
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# #
-# path to gleaner config in Dagster-daemon is "/scheduler/gleanerconfig.yaml" (config file mounted)
-# WHEN RUNNING dagster-dev, this needs to be a path to a local file
-##
-DAGSTER_GLEANER_CONFIG_PATH = os.environ.get('DAGSTER_GLEANER_CONFIG_PATH', "/scheduler/gleanerconfig.yaml")
-
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET = str(os.environ.get('GLEANERIO_MINIO_BUCKET'))
-
-# set for the earthcube utilities
-MINIO_OPTIONS = {
-    "secure": GLEANER_MINIO_USE_SSL,
-    "access_key": GLEANER_MINIO_ACCESS_KEY,
-    "secret_key": GLEANER_MINIO_SECRET_KEY,
-}
-
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE = str(os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-#GLEANERIO_SUMMARY_GRAPH_ENDPOINT = os.environ.get('GLEANERIO_SUMMARY_GRAPH_ENDPOINT')
-GLEANERIO_SUMMARY_GRAPH_NAMESPACE = os.environ.get('GLEANERIO_SUMMARY_GRAPH_NAMESPACE',f"{GLEANER_GRAPH_NAMESPACE}_summary" )
-
-SUMMARY_PATH = 'graphs/summary'
-RELEASE_PATH = 'graphs/latest'
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-def _graphSummaryEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANERIO_SUMMARY_GRAPH_NAMESPACE}/sparql"
- return url
-def _pythonMinioAddress(url, port = None):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- if port is not None:
- PYTHON_MINIO_URL = f"{PYTHON_MINIO_URL}:{port}"
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioAddress(GLEANER_MINIO_ADDRESS,GLEANER_MINIO_PORT )
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
-
- server = _pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT)
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def post_to_graph(source, path=RELEASE_PATH, extension="nq", graphendpoint=_graphEndpoint()):
-    # revision of EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = _pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT)
- bucket = GLEANER_MINIO_BUCKET
- release_url = f"{proto}://{address}/{bucket}/{path}/{source}_release.{extension}"
- # BLAZEGRAPH SPECIFIC
- # url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- # get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- # r = requests.post(url)
- # log.debug(f' status:{r.status_code}') # status:404
- # get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- # if r.status_code == 200:
- # # ''
- # if 'data modified="0"' in r.text:
- # get_dagster_logger().info(f'graph: no data inserted ')
- # raise Exception("No Data Added: " + r.text)
- # return True
- # else:
- # get_dagster_logger().info(f'graph: error')
- # raise Exception(f' graph: insert failed: status:{r.status_code}')
-
- ### GENERIC LOAD FROM
- url = f"{graphendpoint}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- loadfrom = {'update': f'LOAD <{release_url}>'}
- headers = {
- 'Content-Type': 'application/x-www-form-urlencoded'
- }
- r = requests.post(url, headers=headers, data=loadfrom )
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: LOAD from {release_url}: status:{r.status_code}')
- if r.status_code == 200:
- get_dagster_logger().info(f'graph load response: {str(r.text)} ')
- # ''
- if 'mutationCount=0' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- #raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error {str(r.text)}')
- raise Exception(f' graph: failed, LOAD from {release_url}: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
-    ## thoughts
-    # return service, container, since there is one
-    restart_policy = RestartPolicy(condition='none')
-    # docker.py: for a replicated-job, total completions = replicas;
-    # with replicas=0 you do not get a container
-    service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
-        mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise f"Container for service {name} not starting"
-
-    get_dagster_logger().info(f"containers found: {len(containers)}")
- return service, containers[0]
-
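-# The wait loop above discovers the replicated-job's task container via the
-# swarm service-name label; a standalone equivalent (illustrative name):
-#   client.containers.list(all=True,
-#       filters={"label": "com.docker.swarm.service.name=sch_unavco_gleaner"})
-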
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) # the python needs to be wrapped, this does not
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
-        cid = container.id # legacy until the service start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-# this method of watching the logs:
-        # do not let a possible issue with container logs stop log upload.
-        ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
-
- get_dagster_logger().info(f"This is ok. watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"This is ok. watch container logs failed other issue:{repr(ex)} ")
-
-
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + 'containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
-        # Future: need to extract files and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def unavco_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def unavco_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "unavco")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def unavco_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "unavco")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def unavco_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "unavco")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def unavco_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "unavco")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def unavco_naburelease(context):
- returned_value = gleanerio(context,("release"), "unavco")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def unavco_uploadrelease(context):
- returned_value = post_to_graph("unavco", extension="nq")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def unavco_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="unavco")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "unavco"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-    r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def unavco_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="unavco")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "unavco"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def unavco_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="unavco")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "unavco"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def unavco_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="unavco")
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "unavco"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def unavco_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "unavco"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-class S3ObjectInfo:
- bucket_name=""
- object_name=""
-
-@op(ins={"start": In(Nothing)})
-def unavco_summarize(context) :
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "unavco"
- endpoint = _graphEndpoint() # getting data, not uploading data
- summary_namespace = _graphSummaryEndpoint()
-
-
- try:
-
- summarydf = get_summary4repoSubset(endpoint, source_name)
- nt, g = summaryDF2ttl(summarydf, source_name) # let's try the new generator
- summaryttl = g.serialize(format='longturtle')
-        # Let's always write the file out to s3, and insert as a separate process.
-        # we might be able to make this an asset..., but it would need to be accessible by http
-        # if not stored in s3
-        objectname = f"{SUMMARY_PATH}/{source_name}_release.ttl" # needs to match what post_to_graph expects
- s3ObjectInfo= S3ObjectInfo()
- s3ObjectInfo.bucket_name=bucket
- s3ObjectInfo.object_name=objectname
-
- s3Minio.putTextFileToStore(summaryttl, s3ObjectInfo )
- #inserted = sumnsgraph.insert(bytes(summaryttl, 'utf-8'), content_type="application/x-turtle")
- #if not inserted:
- # raise Exception("Loading to graph failed.")
- except Exception as e:
- # use dagster logger
- get_dagster_logger().error(f"Summary. Issue creating graph {str(e)} ")
- raise Exception(f"Loading Summary graph failed. {str(e)}")
- return 1
-
- return
-
-@op(ins={"start": In(Nothing)})
-def unavco_upload_summarize(context):
- returned_value = post_to_graph("unavco",path=SUMMARY_PATH, extension="ttl", graphendpoint=_graphSummaryEndpoint())
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload summary returned {r} ")
- return
-
-#Can we simplify and use just a method. Then import these methods?
-# def missingreport_s3(context, msg: str, source="unavco"):
-#
-# source= getSitemapSourcesFromGleaner("/scheduler/gleanerconfig.yaml", sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="unavco"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_unavco():
- containers = unavco_getImage()
- harvest = unavco_gleaner(start=containers)
-
-# defining Nothing dependencies (see the sketch after this file's diff)
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = unavco_missingreport_s3(start=harvest)
- report_idstat = unavco_identifier_stats(start=report_ms3)
- # for some reason, this causes a "msg parameter missing" error
- report_bucketurl = unavco_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="unavco")
- load_release = unavco_naburelease(start=harvest)
- load_uploadrelease = unavco_uploadrelease(start=load_release)
-
- load_prune = unavco_nabu_prune(start=load_uploadrelease)
- load_prov = unavco_nabuprov(start=load_prune)
- load_org = unavco_nabuorg(start=load_prov)
-
- summarize = unavco_summarize(start=load_uploadrelease)
- upload_summarize = unavco_upload_summarize(start=summarize)
-
-# run after load
- report_msgraph = unavco_missingreport_graph(start=summarize)
- report_graph = unavco_graph_reports(start=report_msgraph)
-
-
-
-
-
-
-
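
The generated harvest_* graphs above chain their ops with Dagster "Nothing"
dependencies: each downstream op declares ins={"start": In(Nothing)}, so it
runs strictly after its upstream op without receiving any value from it
(Nothing inputs are not passed to the op function). A minimal, self-contained
sketch of the pattern; the demo_* names are hypothetical, and only the
In(Nothing) wiring mirrors the generated code:

from dagster import In, Nothing, get_dagster_logger, job, op

@op
def demo_harvest():
    get_dagster_logger().info("harvest step")

@op(ins={"start": In(Nothing)})
def demo_report():
    # ordered after demo_harvest, but no data flows between them
    get_dagster_logger().info("report step")

@job
def demo_pipeline():
    demo_report(start=demo_harvest())
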
diff --git a/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_unidata.py b/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_unidata.py
deleted file mode 100644
index 5af2311e..00000000
--- a/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_unidata.py
+++ /dev/null
@@ -1,819 +0,0 @@
-import distutils.util
-import logging
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner, endpointUpdateNamespace
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-from ec.summarize import summaryDF2ttl, get_summary4graph, get_summary4repoSubset
-from ec.graph.manageGraph import ManageBlazegraph as mg
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# #
-# path to gleaner config in Dagster-daemon is "/scheduler/gleanerconfig.yaml" (config file mounted)
-# WHEN RUNNING dagster-dev, this needs to be a path to a local file
-##
-DAGSTER_GLEANER_CONFIG_PATH = os.environ.get('DAGSTER_GLEANER_CONFIG_PATH', "/scheduler/gleanerconfig.yaml")
-
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-
-# set for the earthcube utilities
-MINIO_OPTIONS={"secure":GLEANER_MINIO_USE_SSL
-
- ,"access_key": GLEANER_MINIO_ACCESS_KEY
- ,"secret_key": GLEANER_MINIO_SECRET_KEY
- }
-
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-#GLEANERIO_SUMMARY_GRAPH_ENDPOINT = os.environ.get('GLEANERIO_SUMMARY_GRAPH_ENDPOINT')
-GLEANERIO_SUMMARY_GRAPH_NAMESPACE = os.environ.get('GLEANERIO_SUMMARY_GRAPH_NAMESPACE',f"{GLEANER_GRAPH_NAMESPACE}_summary" )
-
-SUMMARY_PATH = 'graphs/summary'
-RELEASE_PATH = 'graphs/latest'
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-def _graphSummaryEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANERIO_SUMMARY_GRAPH_NAMESPACE}/sparql"
- return url
-def _pythonMinioAddress(url, port = None):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- if port is not None:
- PYTHON_MINIO_URL = f"{PYTHON_MINIO_URL}:{port}"
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- with open(image_path, 'rb') as f: data = f.read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioAddress(GLEANER_MINIO_ADDRESS,GLEANER_MINIO_PORT )
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
-
- server = _pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT)
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def post_to_graph(source, path=RELEASE_PATH, extension="nq", graphendpoint=_graphEndpoint()):
- # revision of EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = _pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT)
- bucket = GLEANER_MINIO_BUCKET
- release_url = f"{proto}://{address}/{bucket}/{path}/{source}_release.{extension}"
- # BLAZEGRAPH SPECIFIC
- # url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- # get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- # r = requests.post(url)
- # log.debug(f' status:{r.status_code}') # status:404
- # get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- # if r.status_code == 200:
- # # ''
- # if 'data modified="0"' in r.text:
- # get_dagster_logger().info(f'graph: no data inserted ')
- # raise Exception("No Data Added: " + r.text)
- # return True
- # else:
- # get_dagster_logger().info(f'graph: error')
- # raise Exception(f' graph: insert failed: status:{r.status_code}')
-
- ### GENERIC LOAD FROM (a standalone sketch of this request follows this file's diff)
- url = f"{graphendpoint}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- loadfrom = {'update': f'LOAD <{release_url}>'}
- headers = {
- 'Content-Type': 'application/x-www-form-urlencoded'
- }
- r = requests.post(url, headers=headers, data=loadfrom )
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: LOAD from {release_url}: status:{r.status_code}')
- if r.status_code == 200:
- get_dagster_logger().info(f'graph load response: {str(r.text)} ')
- # ''
- if 'mutationCount=0' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- #raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error {str(r.text)}')
- raise Exception(f' graph: failed, LOAD from {release_url}: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
- ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
- # docker.py if replicated job, total completions = replicas
- # replicas =0 you do not get a container
- service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
- mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise f"Container for service {name} not starting"
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
-
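- # A minimal sketch (hypothetical names) of the one-shot Swarm service
- # pattern used by _create_service above: "replicated-job" mode with
- # replicas=1 runs the container to completion exactly once, and the
- # 'none' restart condition keeps it from being rescheduled.
- def _demo_oneshot_service(client):
-     # `client` is assumed to be a docker.DockerClient on a Swarm manager
-     return client.services.create(
-         "alpine:latest",
-         args=["echo", "one-shot job"],
-         name="demo_oneshot",
-         restart_policy=RestartPolicy(condition="none"),
-         mode=ServiceMode("replicated-job", concurrency=1, replicas=1),
-     )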
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) # the python needs to be wrapped, this does not
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
- cid = container.id # legacy until the service start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-# This method of watching the logs should not let a possible
- # issue with container logs stop the log upload.
- ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
-
- get_dagster_logger().info(f"This is ok. watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"This is ok. watch container logs failed other issue:{repr(ex)} ")
-
-
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + 'containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
- # Future: need to extract files, and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def unidata_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def unidata_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "unidata")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def unidata_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "unidata")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def unidata_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "unidata")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def unidata_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "unidata")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def unidata_naburelease(context):
- returned_value = gleanerio(context,("release"), "unidata")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def unidata_uploadrelease(context):
- returned_value = post_to_graph("unidata", extension="nq")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def unidata_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="unidata")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "unidata"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def unidata_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="unidata")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "unidata"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon check off; graph (milled) check only
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def unidata_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="unidata")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "unidata"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def unidata_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="unidata")
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "unidata"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def unidata_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "unidata"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-class S3ObjectInfo:
- bucket_name=""
- object_name=""
-
-@op(ins={"start": In(Nothing)})
-def unidata_summarize(context) :
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "unidata"
- endpoint = _graphEndpoint() # getting data, not uploading data
- summary_namespace = _graphSummaryEndpoint()
-
-
- try:
-
- summarydf = get_summary4repoSubset(endpoint, source_name)
- nt, g = summaryDF2ttl(summarydf, source_name) # let's try the new generator
- summaryttl = g.serialize(format='longturtle')
- # Let's always write the file out to s3, and insert it as a separate process.
- # We might be able to make this an asset..., but it would need to be accessible
- # by http if not stored in s3.
- objectname = f"{SUMMARY_PATH}/{source_name}_release.ttl" # needs to match what is expected by post
- s3ObjectInfo= S3ObjectInfo()
- s3ObjectInfo.bucket_name=bucket
- s3ObjectInfo.object_name=objectname
-
- s3Minio.putTextFileToStore(summaryttl, s3ObjectInfo )
- #inserted = sumnsgraph.insert(bytes(summaryttl, 'utf-8'), content_type="application/x-turtle")
- #if not inserted:
- # raise Exception("Loading to graph failed.")
- except Exception as e:
- # use dagster logger
- get_dagster_logger().error(f"Summary. Issue creating graph {str(e)} ")
- raise Exception(f"Loading Summary graph failed. {str(e)}")
- return 1
-
- return
-
-@op(ins={"start": In(Nothing)})
-def unidata_upload_summarize(context):
- returned_value = post_to_graph("unidata",path=SUMMARY_PATH, extension="ttl", graphendpoint=_graphSummaryEndpoint())
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload summary returned {r} ")
- return
-
-#Can we simplify and use just a method. Then import these methods?
-# def missingreport_s3(context, msg: str, source="unidata"):
-#
-# source= getSitemapSourcesFromGleaner("/scheduler/gleanerconfig.yaml", sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="unidata"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_unidata():
- containers = unidata_getImage()
- harvest = unidata_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = unidata_missingreport_s3(start=harvest)
- report_idstat = unidata_identifier_stats(start=report_ms3)
- # for some reason, this causes a "msg parameter missing" error
- report_bucketurl = unidata_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="unidata")
- load_release = unidata_naburelease(start=harvest)
- load_uploadrelease = unidata_uploadrelease(start=load_release)
-
- load_prune = unidata_nabu_prune(start=load_uploadrelease)
- load_prov = unidata_nabuprov(start=load_prune)
- load_org = unidata_nabuorg(start=load_prov)
-
- summarize = unidata_summarize(start=load_uploadrelease)
- upload_summarize = unidata_upload_summarize(start=summarize)
-
-# run after load
- report_msgraph = unidata_missingreport_graph(start=summarize)
- report_graph = unidata_graph_reports(start=report_msgraph)
-
-
-
-
-
-
-
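
post_to_graph() in the files above loads a release file into the triplestore
by POSTing a SPARQL 1.1 Update "LOAD <url>" request to the namespace endpoint.
A minimal, standalone sketch of that request; the endpoint and release URLs
are placeholders, and the mutationCount check in the response body is
Blazegraph-specific:

import requests

endpoint = "http://localhost:9999/blazegraph/namespace/kb/sparql"  # placeholder
release_url = "https://example.org/bucket/graphs/latest/demo_release.nq"  # placeholder

r = requests.post(
    endpoint,
    headers={"Content-Type": "application/x-www-form-urlencoded"},
    data={"update": f"LOAD <{release_url}>"},
)
print(r.status_code)
if r.status_code == 200 and "mutationCount=0" in r.text:
    print("request succeeded, but no triples were added")
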
diff --git a/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_usapdc.py b/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_usapdc.py
deleted file mode 100644
index ed2f7e9c..00000000
--- a/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_usapdc.py
+++ /dev/null
@@ -1,819 +0,0 @@
-import distutils.util
-import logging
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner, endpointUpdateNamespace
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-from ec.summarize import summaryDF2ttl, get_summary4graph, get_summary4repoSubset
-from ec.graph.manageGraph import ManageBlazegraph as mg
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# #
-# path to gleaner config in Dagster-daemon is "/scheduler/gleanerconfig.yaml" (config file mounted)
-# WHEN RUNNING dagster-dev, this needs to be a path to a local file
-##
-DAGSTER_GLEANER_CONFIG_PATH = os.environ.get('DAGSTER_GLEANER_CONFIG_PATH', "/scheduler/gleanerconfig.yaml")
-
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-
-# set for the earthcube utilities
-MINIO_OPTIONS={"secure":GLEANER_MINIO_USE_SSL
-
- ,"access_key": GLEANER_MINIO_ACCESS_KEY
- ,"secret_key": GLEANER_MINIO_SECRET_KEY
- }
-
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-#GLEANERIO_SUMMARY_GRAPH_ENDPOINT = os.environ.get('GLEANERIO_SUMMARY_GRAPH_ENDPOINT')
-GLEANERIO_SUMMARY_GRAPH_NAMESPACE = os.environ.get('GLEANERIO_SUMMARY_GRAPH_NAMESPACE',f"{GLEANER_GRAPH_NAMESPACE}_summary" )
-
-SUMMARY_PATH = 'graphs/summary'
-RELEASE_PATH = 'graphs/latest'
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-def _graphSummaryEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANERIO_SUMMARY_GRAPH_NAMESPACE}/sparql"
- return url
-def _pythonMinioAddress(url, port = None):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- if port is not None:
- PYTHON_MINIO_URL = f"{PYTHON_MINIO_URL}:{port}"
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- with open(image_path, 'rb') as f: data = f.read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioAddress(GLEANER_MINIO_ADDRESS,GLEANER_MINIO_PORT )
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
-
- server = _pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT)
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def post_to_graph(source, path=RELEASE_PATH, extension="nq", graphendpoint=_graphEndpoint()):
- # revision of EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = _pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT)
- bucket = GLEANER_MINIO_BUCKET
- release_url = f"{proto}://{address}/{bucket}/{path}/{source}_release.{extension}"
- # BLAZEGRAPH SPECIFIC
- # url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- # get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- # r = requests.post(url)
- # log.debug(f' status:{r.status_code}') # status:404
- # get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- # if r.status_code == 200:
- # # ''
- # if 'data modified="0"' in r.text:
- # get_dagster_logger().info(f'graph: no data inserted ')
- # raise Exception("No Data Added: " + r.text)
- # return True
- # else:
- # get_dagster_logger().info(f'graph: error')
- # raise Exception(f' graph: insert failed: status:{r.status_code}')
-
- ### GENERIC LOAD FROM
- url = f"{graphendpoint}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- loadfrom = {'update': f'LOAD <{release_url}>'}
- headers = {
- 'Content-Type': 'application/x-www-form-urlencoded'
- }
- r = requests.post(url, headers=headers, data=loadfrom )
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: LOAD from {release_url}: status:{r.status_code}')
- if r.status_code == 200:
- get_dagster_logger().info(f'graph load response: {str(r.text)} ')
- # ''
- if 'mutationCount=0' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- #raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error {str(r.text)}')
- raise Exception(f' graph: failed, LOAD from {release_url}: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
- ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
- # docker.py if replicated job, total completions = replicas
- # replicas =0 you do not get a container
- service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
- mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise f"Container for service {name} not starting"
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) # the python needs to be wrapped, this does not
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
- cid = container.id # legacy until the service start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-# This method of watching the logs should not let a possible
- # issue with container logs stop the log upload.
- ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
-
- get_dagster_logger().info(f"This is ok. watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"This is ok. watch container logs failed other issue:{repr(ex)} ")
-
-
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + 'containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
- # Future: need to extract files, and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def usapdc_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def usapdc_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "usapdc")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def usapdc_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "usapdc")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def usapdc_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "usapdc")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def usapdc_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "usapdc")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def usapdc_naburelease(context):
- returned_value = gleanerio(context,("release"), "usapdc")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def usapdc_uploadrelease(context):
- returned_value = post_to_graph("usapdc", extension="nq")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def usapdc_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="usapdc")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "usapdc"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def usapdc_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="usapdc")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "usapdc"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def usapdc_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="usapdc")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "usapdc"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def usapdc_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="usapdc")
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "usapdc"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def usapdc_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "usapdc"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-class S3ObjectInfo:
- bucket_name=""
- object_name=""
-
-@op(ins={"start": In(Nothing)})
-def usapdc_summarize(context) :
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "usapdc"
- endpoint = _graphEndpoint() # getting data, not uploading data
- summary_namespace = _graphSummaryEndpoint()
-
-
- try:
-
- summarydf = get_summary4repoSubset(endpoint, source_name)
- nt, g = summaryDF2ttl(summarydf, source_name) # let's try the new generator
- summaryttl = g.serialize(format='longturtle')
- # Let's always write the file out to s3, and insert as a separate process.
- # We might be able to make this an asset..., but it would need to be accessible by http
- # if not stored in s3
- objectname = f"{SUMMARY_PATH}/{source_name}_release.ttl" # needs to match what is expected by post_to_graph
- s3ObjectInfo= S3ObjectInfo()
- s3ObjectInfo.bucket_name=bucket
- s3ObjectInfo.object_name=objectname
-
- s3Minio.putTextFileToStore(summaryttl, s3ObjectInfo )
- #inserted = sumnsgraph.insert(bytes(summaryttl, 'utf-8'), content_type="application/x-turtle")
- #if not inserted:
- # raise Exception("Loading to graph failed.")
- except Exception as e:
- # use dagster logger
- get_dagster_logger().error(f"Summary. Issue creating graph {str(e)} ")
- raise Exception(f"Loading Summary graph failed. {str(e)}")
- return 1
-
- return
-
-@op(ins={"start": In(Nothing)})
-def usapdc_upload_summarize(context):
- returned_value = post_to_graph("usapdc",path=SUMMARY_PATH, extension="ttl", graphendpoint=_graphSummaryEndpoint())
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload summary returned {r} ")
- return
-
-# Can we simplify and use just a method, then import these methods?
-# def missingreport_s3(context, msg: str, source="usapdc"):
-#
-# source= getSitemapSourcesFromGleaner("/scheduler/gleanerconfig.yaml", sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="usapdc"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_usapdc():
- containers = usapdc_getImage()
- harvest = usapdc_gleaner(start=containers)
-
-# defining Nothing dependencies (a minimal sketch of the pattern follows this graph)
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = usapdc_missingreport_s3(start=harvest)
- report_idstat = usapdc_identifier_stats(start=report_ms3)
- # for some reason, this causes a msg parameter missing
- report_bucketurl = usapdc_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="usapdc")
- load_release = usapdc_naburelease(start=harvest)
- load_uploadrelease = usapdc_uploadrelease(start=load_release)
-
- load_prune = usapdc_nabu_prune(start=load_uploadrelease)
- load_prov = usapdc_nabuprov(start=load_prune)
- load_org = usapdc_nabuorg(start=load_prov)
-
- summarize = usapdc_summarize(start=load_uploadrelease)
- upload_summarize = usapdc_upload_summarize(start=summarize)
-
-# run after load
- report_msgraph = usapdc_missingreport_graph(start=summarize)
- report_graph = usapdc_graph_reports(start=report_msgraph)
-
-
-
-
-
-
-
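-# A minimal sketch of the Nothing-dependency pattern used in the graph above;
-# the op and graph names here are hypothetical, not part of this module:
-#
-#   from dagster import In, Nothing, graph, op
-#
-#   @op
-#   def first_op():
-#       ...
-#
-#   @op(ins={"start": In(Nothing)})
-#   def second_op():
-#       ...
-#
-#   @graph
-#   def example_graph():
-#       # second_op runs only after first_op completes; no data is passed
-#       second_op(start=first_op())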
diff --git a/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_wifire.py b/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_wifire.py
deleted file mode 100644
index 60f465bc..00000000
--- a/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_wifire.py
+++ /dev/null
@@ -1,819 +0,0 @@
-import distutils
-import logging
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner, endpointUpdateNamespace
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-from ec.summarize import summaryDF2ttl, get_summary4graph, get_summary4repoSubset
-from ec.graph.manageGraph import ManageBlazegraph as mg
-import requests
-import logging as log
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# #
-# path to gleaner config in Dagster-daemon is "/scheduler/gleanerconfig.yaml" (config file mounted)
-# WHEN RUNNING dagster-dev, this needs to be a path to a local file
-##
-DAGSTER_GLEANER_CONFIG_PATH = os.environ.get('DAGSTER_GLEANER_CONFIG_PATH', "/scheduler/gleanerconfig.yaml")
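-# e.g. a local `dagster dev` run might be started as (path illustrative):
-#   DAGSTER_GLEANER_CONFIG_PATH=./gleanerconfig.yaml dagster dev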
-
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-
-# set for the earthcube utilities
-MINIO_OPTIONS={"secure":GLEANER_MINIO_USE_SSL
-
- ,"access_key": GLEANER_MINIO_ACCESS_KEY
- ,"secret_key": GLEANER_MINIO_SECRET_KEY
- }
-
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-#GLEANERIO_SUMMARY_GRAPH_ENDPOINT = os.environ.get('GLEANERIO_SUMMARY_GRAPH_ENDPOINT')
-GLEANERIO_SUMMARY_GRAPH_NAMESPACE = os.environ.get('GLEANERIO_SUMMARY_GRAPH_NAMESPACE',f"{GLEANER_GRAPH_NAMESPACE}_summary" )
-
-SUMMARY_PATH = 'graphs/summary'
-RELEASE_PATH = 'graphs/latest'
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-def _graphSummaryEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANERIO_SUMMARY_GRAPH_NAMESPACE}/sparql"
- return url
-def _pythonMinioAddress(url, port = None):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- if port is not None:
- PYTHON_MINIO_URL = f"{PYTHON_MINIO_URL}:{port}"
- return PYTHON_MINIO_URL
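-# e.g. (illustrative values):
-#   _pythonMinioAddress("minio.example.org", 9000)   -> "minio.example.org:9000"
-#   _pythonMinioAddress("mybucket.s3.amazonaws.com") -> "s3.amazonaws.com"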
-def read_file_bytestream(image_path):
-    with open(image_path, 'rb') as f:
-        data = f.read()
-    return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
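-# load_data accepts either a URL or a local path: urlopen raises ValueError for
-# a non-URL string, and the fallback reads the file. e.g. (illustrative inputs):
-#   cfg = load_data("https://example.org/gleanerconfig.yaml")
-#   cfg = load_data("/scheduler/gleanerconfig.yaml")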
-
-
-def s3reader(object):
- server = _pythonMinioAddress(GLEANER_MINIO_ADDRESS,GLEANER_MINIO_PORT )
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
-
- server = _pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT)
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-# note: the graphendpoint default is evaluated once, at import time
-def post_to_graph(source, path=RELEASE_PATH, extension="nq", graphendpoint=_graphEndpoint()):
- # a revision of EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = _pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT)
- bucket = GLEANER_MINIO_BUCKET
- release_url = f"{proto}://{address}/{bucket}/{path}/{source}_release.{extension}"
- # BLAZEGRAPH SPECIFIC
- # url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- # get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- # r = requests.post(url)
- # log.debug(f' status:{r.status_code}') # status:404
- # get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- # if r.status_code == 200:
- # # ''
- # if 'data modified="0"' in r.text:
- # get_dagster_logger().info(f'graph: no data inserted ')
- # raise Exception("No Data Added: " + r.text)
- # return True
- # else:
- # get_dagster_logger().info(f'graph: error')
- # raise Exception(f' graph: insert failed: status:{r.status_code}')
-
- ### GENERIC LOAD FROM
- url = f"{graphendpoint}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- loadfrom = {'update': f'LOAD <{release_url}>'}
- headers = {
- 'Content-Type': 'application/x-www-form-urlencoded'
- }
- r = requests.post(url, headers=headers, data=loadfrom )
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: LOAD from {release_url}: status:{r.status_code}')
- if r.status_code == 200:
- get_dagster_logger().info(f'graph load response: {str(r.text)} ')
- # ''
- if 'mutationCount=0' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- #raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error {str(r.text)}')
- raise Exception(f' graph: failed, LOAD from {release_url}: status:{r.status_code}')
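-# A minimal sketch of the request post_to_graph issues; the endpoint and object
-# URL below are illustrative, not taken from any deployment:
-#
-#   import requests
-#   requests.post(
-#       "http://graph:9999/blazegraph/namespace/eco/sparql",
-#       headers={"Content-Type": "application/x-www-form-urlencoded"},
-#       data={"update": "LOAD <http://minio:9000/gleaner/graphs/latest/demo_release.nq>"},
-#   )
-#
-# A 200 response whose body contains "mutationCount=0" means the store accepted
-# the request but loaded no triples.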
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
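-# Note: the _general_configs/headers manipulation above is a workaround so every
-# docker-py request carries the Portainer X-API-Key header; docker-py does not
-# appear to expose a public per-request header option (see the decorators link
-# in gleanerio below).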
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
- ## thoughts:
- # return service and container, since there is only one container
- restart_policy = RestartPolicy(condition='none')
- # docker.py if replicated job, total completions = replicas
- # replicas =0 you do not get a container
- service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
- mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise f"Container for service {name} not starting"
-
- get_dagster_logger().info(f"containers found for service: {len(containers)}")
- return service, containers[0]
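-# Design note: a swarm "replicated-job" service with replicas=1 runs the task to
-# completion exactly once, and the polling loop above is needed because the API
-# returns the service before its backing container exists; the container is then
-# located by its com.docker.swarm.service.name label.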
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
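- # Summary of the dispatch above (config paths abbreviated):
- #   gleaner -> GLEANERIO_GLEANER_IMAGE, --cfg <gleaner cfg> -source <source> --rude
- #   prune   -> GLEANERIO_NABU_IMAGE,    --cfg <nabu cfg> prune   --prefix summoned/<source>
- #   prov    -> GLEANERIO_NABU_IMAGE,    --cfg <nabu cfg> prefix  --prefix prov/<source>
- #   orgs    -> GLEANERIO_NABU_IMAGE,    --cfg <nabu cfg> prefix  --prefix orgs
- #   release -> GLEANERIO_NABU_IMAGE,    --cfg <nabu cfg> release --prefix summoned/<source>
- # Any other mode returns 1 without starting a container.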
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- service = None # ensure the finally block below can tell whether the service was created
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) # the python needs to be wrapped, this does not
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
- cid = container.id # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
- # This method of watching the logs should not let a possible issue with
- # container logs stop the log upload.
- ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
-
- get_dagster_logger().info(f"This is ok. watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"This is ok. watch container logs failed other issue:{repr(ex)} ")
-
-
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + 'containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
- # Future: need to extract files and upload them
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def wifire_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def wifire_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "wifire")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def wifire_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "wifire")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def wifire_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "wifire")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def wifire_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "wifire")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def wifire_naburelease(context):
- returned_value = gleanerio(context,("release"), "wifire")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def wifire_uploadrelease(context):
- returned_value = post_to_graph("wifire", extension="nq")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def wifire_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="wifire")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "wifire"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def wifire_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="wifire")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "wifire"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def wifire_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="wifire")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "wifire"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def wifire_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="wifire")
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "wifire"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def wifire_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "wifire"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-class S3ObjectInfo:
- bucket_name=""
- object_name=""
-
-@op(ins={"start": In(Nothing)})
-def wifire_summarize(context) :
- s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "wifire"
- endpoint = _graphEndpoint() # getting data, not uploading data
- summary_namespace = _graphSummaryEndpoint()
-
-
- try:
-
- summarydf = get_summary4repoSubset(endpoint, source_name)
- nt, g = summaryDF2ttl(summarydf, source_name) # let's try the new generator
- summaryttl = g.serialize(format='longturtle')
- # Let's always write the file out to s3, and insert as a separate process.
- # We might be able to make this an asset..., but it would need to be accessible by http
- # if not stored in s3
- objectname = f"{SUMMARY_PATH}/{source_name}_release.ttl" # needs to match what is expected by post_to_graph
- s3ObjectInfo= S3ObjectInfo()
- s3ObjectInfo.bucket_name=bucket
- s3ObjectInfo.object_name=objectname
-
- s3Minio.putTextFileToStore(summaryttl, s3ObjectInfo )
- #inserted = sumnsgraph.insert(bytes(summaryttl, 'utf-8'), content_type="application/x-turtle")
- #if not inserted:
- # raise Exception("Loading to graph failed.")
- except Exception as e:
- # use dagster logger
- get_dagster_logger().error(f"Summary. Issue creating graph {str(e)} ")
- raise Exception(f"Loading Summary graph failed. {str(e)}")
- return 1
-
- return
-
-@op(ins={"start": In(Nothing)})
-def wifire_upload_summarize(context):
- returned_value = post_to_graph("wifire",path=SUMMARY_PATH, extension="ttl", graphendpoint=_graphSummaryEndpoint())
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload summary returned {r} ")
- return
-
-# Can we simplify and use just a method, then import these methods?
-# def missingreport_s3(context, msg: str, source="wifire"):
-#
-# source= getSitemapSourcesFromGleaner("/scheduler/gleanerconfig.yaml", sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="wifire"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_wifire():
- containers = wifire_getImage()
- harvest = wifire_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = wifire_missingreport_s3(start=harvest)
- report_idstat = wifire_identifier_stats(start=report_ms3)
- # for some reason, this causes a msg parameter missing
- report_bucketurl = wifire_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="wifire")
- load_release = wifire_naburelease(start=harvest)
- load_uploadrelease = wifire_uploadrelease(start=load_release)
-
- load_prune = wifire_nabu_prune(start=load_uploadrelease)
- load_prov = wifire_nabuprov(start=load_prune)
- load_org = wifire_nabuorg(start=load_prov)
-
- summarize = wifire_summarize(start=load_uploadrelease)
- upload_summarize = wifire_upload_summarize(start=summarize)
-
-# run after load
- report_msgraph = wifire_missingreport_graph(start=summarize)
- report_graph = wifire_graph_reports(start=report_msgraph)
-
-
-
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-eco/output/repositories/repository.py b/dagster/implnets/generatedCode/implnet-eco/output/repositories/repository.py
deleted file mode 100644
index e77f2141..00000000
--- a/dagster/implnets/generatedCode/implnet-eco/output/repositories/repository.py
+++ /dev/null
@@ -1,67 +0,0 @@
-from dagster import repository
-from jobs.implnet_jobs_amgeo import implnet_job_amgeo
-from sch.implnet_sch_amgeo import implnet_sch_amgeo
-from jobs.implnet_jobs_aquadocs import implnet_job_aquadocs
-from sch.implnet_sch_aquadocs import implnet_sch_aquadocs
-from jobs.implnet_jobs_bcodmo import implnet_job_bcodmo
-from sch.implnet_sch_bcodmo import implnet_sch_bcodmo
-from jobs.implnet_jobs_cchdo import implnet_job_cchdo
-from sch.implnet_sch_cchdo import implnet_sch_cchdo
-from jobs.implnet_jobs_datadiscoverystudio import implnet_job_datadiscoverystudio
-from sch.implnet_sch_datadiscoverystudio import implnet_sch_datadiscoverystudio
-from jobs.implnet_jobs_designsafe import implnet_job_designsafe
-from sch.implnet_sch_designsafe import implnet_sch_designsafe
-from jobs.implnet_jobs_earthchem import implnet_job_earthchem
-from sch.implnet_sch_earthchem import implnet_sch_earthchem
-from jobs.implnet_jobs_ecrr_examples import implnet_job_ecrr_examples
-from sch.implnet_sch_ecrr_examples import implnet_sch_ecrr_examples
-from jobs.implnet_jobs_edi import implnet_job_edi
-from sch.implnet_sch_edi import implnet_sch_edi
-from jobs.implnet_jobs_geocodes_demo_datasets import implnet_job_geocodes_demo_datasets
-from sch.implnet_sch_geocodes_demo_datasets import implnet_sch_geocodes_demo_datasets
-from jobs.implnet_jobs_geocodes_examples import implnet_job_geocodes_examples
-from sch.implnet_sch_geocodes_examples import implnet_sch_geocodes_examples
-from jobs.implnet_jobs_hydroshare import implnet_job_hydroshare
-from sch.implnet_sch_hydroshare import implnet_sch_hydroshare
-from jobs.implnet_jobs_iedadata import implnet_job_iedadata
-from sch.implnet_sch_iedadata import implnet_sch_iedadata
-from jobs.implnet_jobs_iris import implnet_job_iris
-from sch.implnet_sch_iris import implnet_sch_iris
-from jobs.implnet_jobs_linkedearth import implnet_job_linkedearth
-from sch.implnet_sch_linkedearth import implnet_sch_linkedearth
-from jobs.implnet_jobs_lipdverse import implnet_job_lipdverse
-from sch.implnet_sch_lipdverse import implnet_sch_lipdverse
-from jobs.implnet_jobs_magic import implnet_job_magic
-from sch.implnet_sch_magic import implnet_sch_magic
-from jobs.implnet_jobs_neon import implnet_job_neon
-from sch.implnet_sch_neon import implnet_sch_neon
-from jobs.implnet_jobs_neotomadb import implnet_job_neotomadb
-from sch.implnet_sch_neotomadb import implnet_sch_neotomadb
-from jobs.implnet_jobs_opencoredata import implnet_job_opencoredata
-from sch.implnet_sch_opencoredata import implnet_sch_opencoredata
-from jobs.implnet_jobs_opentopography import implnet_job_opentopography
-from sch.implnet_sch_opentopography import implnet_sch_opentopography
-from jobs.implnet_jobs_r2r import implnet_job_r2r
-from sch.implnet_sch_r2r import implnet_sch_r2r
-from jobs.implnet_jobs_resource_registry import implnet_job_resource_registry
-from sch.implnet_sch_resource_registry import implnet_sch_resource_registry
-from jobs.implnet_jobs_ssdbiodp import implnet_job_ssdbiodp
-from sch.implnet_sch_ssdbiodp import implnet_sch_ssdbiodp
-from jobs.implnet_jobs_ucar import implnet_job_ucar
-from sch.implnet_sch_ucar import implnet_sch_ucar
-from jobs.implnet_jobs_unavco import implnet_job_unavco
-from sch.implnet_sch_unavco import implnet_sch_unavco
-from jobs.implnet_jobs_unidata import implnet_job_unidata
-from sch.implnet_sch_unidata import implnet_sch_unidata
-from jobs.implnet_jobs_usapdc import implnet_job_usapdc
-from sch.implnet_sch_usapdc import implnet_sch_usapdc
-from jobs.implnet_jobs_wifire import implnet_job_wifire
-from sch.implnet_sch_wifire import implnet_sch_wifire
-
-@repository
-def gleaner():
- jobs = [implnet_job_amgeo, implnet_job_aquadocs, implnet_job_bcodmo, implnet_job_cchdo, implnet_job_datadiscoverystudio, implnet_job_designsafe, implnet_job_earthchem, implnet_job_ecrr_examples, implnet_job_edi, implnet_job_geocodes_demo_datasets, implnet_job_geocodes_examples, implnet_job_hydroshare, implnet_job_iedadata, implnet_job_iris, implnet_job_linkedearth, implnet_job_lipdverse, implnet_job_magic, implnet_job_neon, implnet_job_neotomadb, implnet_job_opencoredata, implnet_job_opentopography, implnet_job_r2r, implnet_job_resource_registry, implnet_job_ssdbiodp, implnet_job_ucar, implnet_job_unavco, implnet_job_unidata, implnet_job_usapdc, implnet_job_wifire]
- schedules = [implnet_sch_amgeo, implnet_sch_aquadocs, implnet_sch_bcodmo, implnet_sch_cchdo, implnet_sch_datadiscoverystudio, implnet_sch_designsafe, implnet_sch_earthchem, implnet_sch_ecrr_examples, implnet_sch_edi, implnet_sch_geocodes_demo_datasets, implnet_sch_geocodes_examples, implnet_sch_hydroshare, implnet_sch_iedadata, implnet_sch_iris, implnet_sch_linkedearth, implnet_sch_lipdverse, implnet_sch_magic, implnet_sch_neon, implnet_sch_neotomadb, implnet_sch_opencoredata, implnet_sch_opentopography, implnet_sch_r2r, implnet_sch_resource_registry, implnet_sch_ssdbiodp, implnet_sch_ucar, implnet_sch_unavco, implnet_sch_unidata, implnet_sch_usapdc, implnet_sch_wifire]
-
-
- return jobs + schedules
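-# Generated pattern: each harvest source contributes one job and one schedule,
-# and the repository simply concatenates the two lists for Dagster to discover.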
diff --git a/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_amgeo.py b/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_amgeo.py
deleted file mode 100644
index 8e861599..00000000
--- a/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_amgeo.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_amgeo import implnet_job_amgeo
-
-@schedule(cron_schedule="0 0 1 * *", job=implnet_job_amgeo, execution_timezone="US/Central")
-def implnet_sch_amgeo(_context):
- run_config = {}
- return run_config
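-# Note: the generated schedules stagger sources six hours apart
-# (0 0 1, 0 6 1, 0 12 1, 0 18 1, then 0 0 2, ...), so roughly four monthly
-# harvests start per day.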
diff --git a/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_aquadocs.py b/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_aquadocs.py
deleted file mode 100644
index 00c1bf64..00000000
--- a/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_aquadocs.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_aquadocs import implnet_job_aquadocs
-
-@schedule(cron_schedule="0 6 1 * *", job=implnet_job_aquadocs, execution_timezone="US/Central")
-def implnet_sch_aquadocs(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_bcodmo.py b/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_bcodmo.py
deleted file mode 100644
index f45ed457..00000000
--- a/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_bcodmo.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_bcodmo import implnet_job_bcodmo
-
-@schedule(cron_schedule="0 12 1 * *", job=implnet_job_bcodmo, execution_timezone="US/Central")
-def implnet_sch_bcodmo(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_cchdo.py b/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_cchdo.py
deleted file mode 100644
index 09ae1e29..00000000
--- a/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_cchdo.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_cchdo import implnet_job_cchdo
-
-@schedule(cron_schedule="0 18 1 * *", job=implnet_job_cchdo, execution_timezone="US/Central")
-def implnet_sch_cchdo(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_datadiscoverystudio.py b/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_datadiscoverystudio.py
deleted file mode 100644
index 2d15eb65..00000000
--- a/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_datadiscoverystudio.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_datadiscoverystudio import implnet_job_datadiscoverystudio
-
-@schedule(cron_schedule="0 0 2 * *", job=implnet_job_datadiscoverystudio, execution_timezone="US/Central")
-def implnet_sch_datadiscoverystudio(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_designsafe.py b/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_designsafe.py
deleted file mode 100644
index 119c6b48..00000000
--- a/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_designsafe.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_designsafe import implnet_job_designsafe
-
-@schedule(cron_schedule="0 6 2 * *", job=implnet_job_designsafe, execution_timezone="US/Central")
-def implnet_sch_designsafe(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_earthchem.py b/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_earthchem.py
deleted file mode 100644
index c7860c4d..00000000
--- a/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_earthchem.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_earthchem import implnet_job_earthchem
-
-@schedule(cron_schedule="0 12 2 * *", job=implnet_job_earthchem, execution_timezone="US/Central")
-def implnet_sch_earthchem(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_ecrr_examples.py b/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_ecrr_examples.py
deleted file mode 100644
index 0a6d6ffa..00000000
--- a/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_ecrr_examples.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_ecrr_examples import implnet_job_ecrr_examples
-
-@schedule(cron_schedule="0 18 2 * *", job=implnet_job_ecrr_examples, execution_timezone="US/Central")
-def implnet_sch_ecrr_examples(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_edi.py b/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_edi.py
deleted file mode 100644
index a864502d..00000000
--- a/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_edi.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_edi import implnet_job_edi
-
-@schedule(cron_schedule="0 0 3 * *", job=implnet_job_edi, execution_timezone="US/Central")
-def implnet_sch_edi(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_geocodes_demo_datasets.py b/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_geocodes_demo_datasets.py
deleted file mode 100644
index 092d312f..00000000
--- a/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_geocodes_demo_datasets.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_geocodes_demo_datasets import implnet_job_geocodes_demo_datasets
-
-@schedule(cron_schedule="0 6 3 * *", job=implnet_job_geocodes_demo_datasets, execution_timezone="US/Central")
-def implnet_sch_geocodes_demo_datasets(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_geocodes_examples.py b/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_geocodes_examples.py
deleted file mode 100644
index 2e409a45..00000000
--- a/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_geocodes_examples.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_geocodes_examples import implnet_job_geocodes_examples
-
-@schedule(cron_schedule="0 12 3 * *", job=implnet_job_geocodes_examples, execution_timezone="US/Central")
-def implnet_sch_geocodes_examples(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_hydroshare.py b/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_hydroshare.py
deleted file mode 100644
index 8fd85c67..00000000
--- a/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_hydroshare.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_hydroshare import implnet_job_hydroshare
-
-@schedule(cron_schedule="0 18 3 * *", job=implnet_job_hydroshare, execution_timezone="US/Central")
-def implnet_sch_hydroshare(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_iedadata.py b/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_iedadata.py
deleted file mode 100644
index 6010ae2c..00000000
--- a/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_iedadata.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_iedadata import implnet_job_iedadata
-
-@schedule(cron_schedule="0 0 4 * *", job=implnet_job_iedadata, execution_timezone="US/Central")
-def implnet_sch_iedadata(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_iris.py b/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_iris.py
deleted file mode 100644
index d65602b7..00000000
--- a/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_iris.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_iris import implnet_job_iris
-
-@schedule(cron_schedule="0 6 4 * *", job=implnet_job_iris, execution_timezone="US/Central")
-def implnet_sch_iris(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_linkedearth.py b/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_linkedearth.py
deleted file mode 100644
index 5ba7776b..00000000
--- a/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_linkedearth.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_linkedearth import implnet_job_linkedearth
-
-@schedule(cron_schedule="0 12 4 * *", job=implnet_job_linkedearth, execution_timezone="US/Central")
-def implnet_sch_linkedearth(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_lipdverse.py b/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_lipdverse.py
deleted file mode 100644
index 3483de64..00000000
--- a/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_lipdverse.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_lipdverse import implnet_job_lipdverse
-
-@schedule(cron_schedule="0 18 4 * *", job=implnet_job_lipdverse, execution_timezone="US/Central")
-def implnet_sch_lipdverse(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_magic.py b/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_magic.py
deleted file mode 100644
index 62859a3c..00000000
--- a/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_magic.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_magic import implnet_job_magic
-
-@schedule(cron_schedule="0 0 5 * *", job=implnet_job_magic, execution_timezone="US/Central")
-def implnet_sch_magic(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_neon.py b/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_neon.py
deleted file mode 100644
index 5ac234b6..00000000
--- a/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_neon.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_neon import implnet_job_neon
-
-@schedule(cron_schedule="0 6 5 * *", job=implnet_job_neon, execution_timezone="US/Central")
-def implnet_sch_neon(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_neotomadb.py b/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_neotomadb.py
deleted file mode 100644
index 1e928579..00000000
--- a/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_neotomadb.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_neotomadb import implnet_job_neotomadb
-
-@schedule(cron_schedule="0 12 5 * *", job=implnet_job_neotomadb, execution_timezone="US/Central")
-def implnet_sch_neotomadb(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_opencoredata.py b/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_opencoredata.py
deleted file mode 100644
index 626f7e9c..00000000
--- a/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_opencoredata.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_opencoredata import implnet_job_opencoredata
-
-@schedule(cron_schedule="0 18 5 * *", job=implnet_job_opencoredata, execution_timezone="US/Central")
-def implnet_sch_opencoredata(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_opentopography.py b/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_opentopography.py
deleted file mode 100644
index 3bc4a32b..00000000
--- a/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_opentopography.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_opentopography import implnet_job_opentopography
-
-@schedule(cron_schedule="0 0 6 * *", job=implnet_job_opentopography, execution_timezone="US/Central")
-def implnet_sch_opentopography(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_r2r.py b/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_r2r.py
deleted file mode 100644
index 132e6a59..00000000
--- a/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_r2r.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_r2r import implnet_job_r2r
-
-@schedule(cron_schedule="0 6 6 * *", job=implnet_job_r2r, execution_timezone="US/Central")
-def implnet_sch_r2r(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_resource_registry.py b/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_resource_registry.py
deleted file mode 100644
index 09713367..00000000
--- a/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_resource_registry.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_resource_registry import implnet_job_resource_registry
-
-@schedule(cron_schedule="0 12 6 * *", job=implnet_job_resource_registry, execution_timezone="US/Central")
-def implnet_sch_resource_registry(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_ssdbiodp.py b/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_ssdbiodp.py
deleted file mode 100644
index c850fa6e..00000000
--- a/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_ssdbiodp.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_ssdbiodp import implnet_job_ssdbiodp
-
-@schedule(cron_schedule="0 18 6 * *", job=implnet_job_ssdbiodp, execution_timezone="US/Central")
-def implnet_sch_ssdbiodp(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_ucar.py b/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_ucar.py
deleted file mode 100644
index 55161270..00000000
--- a/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_ucar.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_ucar import implnet_job_ucar
-
-@schedule(cron_schedule="0 0 7 * *", job=implnet_job_ucar, execution_timezone="US/Central")
-def implnet_sch_ucar(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_unavco.py b/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_unavco.py
deleted file mode 100644
index fb6cbbfc..00000000
--- a/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_unavco.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_unavco import implnet_job_unavco
-
-@schedule(cron_schedule="0 6 7 * *", job=implnet_job_unavco, execution_timezone="US/Central")
-def implnet_sch_unavco(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_unidata.py b/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_unidata.py
deleted file mode 100644
index 8915a422..00000000
--- a/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_unidata.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_unidata import implnet_job_unidata
-
-@schedule(cron_schedule="0 12 7 * *", job=implnet_job_unidata, execution_timezone="US/Central")
-def implnet_sch_unidata(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_usapdc.py b/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_usapdc.py
deleted file mode 100644
index 170dfba5..00000000
--- a/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_usapdc.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_usapdc import implnet_job_usapdc
-
-@schedule(cron_schedule="0 18 7 * *", job=implnet_job_usapdc, execution_timezone="US/Central")
-def implnet_sch_usapdc(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_wifire.py b/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_wifire.py
deleted file mode 100644
index 746d27a1..00000000
--- a/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_wifire.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_wifire import implnet_job_wifire
-
-@schedule(cron_schedule="0 0 1 * *", job=implnet_job_wifire, execution_timezone="US/Central")
-def implnet_sch_wifire(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_aiannh0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_aiannh0.py
deleted file mode 100644
index 985bc623..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_aiannh0.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_aiannh0 import harvest_aiannh0
-
-@job
-def implnet_job_aiannh0():
- harvest_aiannh0()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_autotest10.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_autotest10.py
deleted file mode 100644
index 7febeb1e..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_autotest10.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_autotest10 import harvest_autotest10
-
-@job
-def implnet_job_autotest10():
- harvest_autotest10()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_autotest20.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_autotest20.py
deleted file mode 100644
index 8d68d11f..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_autotest20.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_autotest20 import harvest_autotest20
-
-@job
-def implnet_job_autotest20():
- harvest_autotest20()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cagagespids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cagagespids0.py
deleted file mode 100644
index 79278858..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cagagespids0.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_cagagespids0 import harvest_cagagespids0
-
-@job
-def implnet_job_cagagespids0():
- harvest_cagagespids0()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cbsa0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cbsa0.py
deleted file mode 100644
index 0023ea51..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cbsa0.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_cbsa0 import harvest_cbsa0
-
-@job
-def implnet_job_cbsa0():
- harvest_cbsa0()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_chyldpilotids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_chyldpilotids0.py
deleted file mode 100644
index e0b87163..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_chyldpilotids0.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_chyldpilotids0 import harvest_chyldpilotids0
-
-@job
-def implnet_job_chyldpilotids0():
- harvest_chyldpilotids0()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_counties0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_counties0.py
deleted file mode 100644
index b6d2feab..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_counties0.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_counties0 import harvest_counties0
-
-@job
-def implnet_job_counties0():
- harvest_counties0()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisandrewsforestlterids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisandrewsforestlterids0.py
deleted file mode 100644
index e8397930..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisandrewsforestlterids0.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_cuahsihisandrewsforestlterids0 import harvest_cuahsihisandrewsforestlterids0
-
-@job
-def implnet_job_cuahsihisandrewsforestlterids0():
- harvest_cuahsihisandrewsforestlterids0()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisbrazilucbids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisbrazilucbids0.py
deleted file mode 100644
index 6fddea7b..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisbrazilucbids0.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_cuahsihisbrazilucbids0 import harvest_cuahsihisbrazilucbids0
-
-@job
-def implnet_job_cuahsihisbrazilucbids0():
- harvest_cuahsihisbrazilucbids0()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihiscalvinhhsids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihiscalvinhhsids0.py
deleted file mode 100644
index d44b94ae..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihiscalvinhhsids0.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_cuahsihiscalvinhhsids0 import harvest_cuahsihiscalvinhhsids0
-
-@job
-def implnet_job_cuahsihiscalvinhhsids0():
- harvest_cuahsihiscalvinhhsids0()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisccbepdapids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisccbepdapids0.py
deleted file mode 100644
index f6ff9eeb..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisccbepdapids0.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_cuahsihisccbepdapids0 import harvest_cuahsihisccbepdapids0
-
-@job
-def implnet_job_cuahsihisccbepdapids0():
- harvest_cuahsihisccbepdapids0()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihiscedarriverids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihiscedarriverids0.py
deleted file mode 100644
index ece85dc1..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihiscedarriverids0.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_cuahsihiscedarriverids0 import harvest_cuahsihiscedarriverids0
-
-@job
-def implnet_job_cuahsihiscedarriverids0():
- harvest_cuahsihiscedarriverids0()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisclarksburgspids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisclarksburgspids0.py
deleted file mode 100644
index ce0d45ec..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisclarksburgspids0.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_cuahsihisclarksburgspids0 import harvest_cuahsihisclarksburgspids0
-
-@job
-def implnet_job_cuahsihisclarksburgspids0():
- harvest_cuahsihisclarksburgspids0()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihiscocorahsids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihiscocorahsids0.py
deleted file mode 100644
index 618cde85..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihiscocorahsids0.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_cuahsihiscocorahsids0 import harvest_cuahsihiscocorahsids0
-
-@job
-def implnet_job_cuahsihiscocorahsids0():
- harvest_cuahsihiscocorahsids0()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihiscrwaids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihiscrwaids0.py
deleted file mode 100644
index 69e6b2bd..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihiscrwaids0.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_cuahsihiscrwaids0 import harvest_cuahsihiscrwaids0
-
-@job
-def implnet_job_cuahsihiscrwaids0():
- harvest_cuahsihiscrwaids0()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihiscuisoids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihiscuisoids0.py
deleted file mode 100644
index 13ef179b..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihiscuisoids0.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_cuahsihiscuisoids0 import harvest_cuahsihiscuisoids0
-
-@job
-def implnet_job_cuahsihiscuisoids0():
- harvest_cuahsihiscuisoids0()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisczoarizids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisczoarizids0.py
deleted file mode 100644
index e7fcc57c..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisczoarizids0.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_cuahsihisczoarizids0 import harvest_cuahsihisczoarizids0
-
-@job
-def implnet_job_cuahsihisczoarizids0():
- harvest_cuahsihisczoarizids0()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisczoboulderids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisczoboulderids0.py
deleted file mode 100644
index dba5a017..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisczoboulderids0.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_cuahsihisczoboulderids0 import harvest_cuahsihisczoboulderids0
-
-@job
-def implnet_job_cuahsihisczoboulderids0():
- harvest_cuahsihisczoboulderids0()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisczocatalinaids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisczocatalinaids0.py
deleted file mode 100644
index 5912966c..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisczocatalinaids0.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_cuahsihisczocatalinaids0 import harvest_cuahsihisczocatalinaids0
-
-@job
-def implnet_job_cuahsihisczocatalinaids0():
- harvest_cuahsihisczocatalinaids0()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisczoluquilloids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisczoluquilloids0.py
deleted file mode 100644
index 9e87cac2..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisczoluquilloids0.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_cuahsihisczoluquilloids0 import harvest_cuahsihisczoluquilloids0
-
-@job
-def implnet_job_cuahsihisczoluquilloids0():
- harvest_cuahsihisczoluquilloids0()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisczomercedids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisczomercedids0.py
deleted file mode 100644
index 4c23dda9..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisczomercedids0.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_cuahsihisczomercedids0 import harvest_cuahsihisczomercedids0
-
-@job
-def implnet_job_cuahsihisczomercedids0():
- harvest_cuahsihisczomercedids0()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisczopsuids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisczopsuids0.py
deleted file mode 100644
index 56914fae..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisczopsuids0.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_cuahsihisczopsuids0 import harvest_cuahsihisczopsuids0
-
-@job
-def implnet_job_cuahsihisczopsuids0():
- harvest_cuahsihisczopsuids0()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisczoudelids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisczoudelids0.py
deleted file mode 100644
index 96309a4c..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisczoudelids0.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_cuahsihisczoudelids0 import harvest_cuahsihisczoudelids0
-
-@job
-def implnet_job_cuahsihisczoudelids0():
- harvest_cuahsihisczoudelids0()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisdrwiids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisdrwiids0.py
deleted file mode 100644
index 6863e32d..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisdrwiids0.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_cuahsihisdrwiids0 import harvest_cuahsihisdrwiids0
-
-@job
-def implnet_job_cuahsihisdrwiids0():
- harvest_cuahsihisdrwiids0()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisfarmrwaids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisfarmrwaids0.py
deleted file mode 100644
index 97774eed..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisfarmrwaids0.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_cuahsihisfarmrwaids0 import harvest_cuahsihisfarmrwaids0
-
-@job
-def implnet_job_cuahsihisfarmrwaids0():
- harvest_cuahsihisfarmrwaids0()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisfcelterids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisfcelterids0.py
deleted file mode 100644
index 76f0027e..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisfcelterids0.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_cuahsihisfcelterids0 import harvest_cuahsihisfcelterids0
-
-@job
-def implnet_job_cuahsihisfcelterids0():
- harvest_cuahsihisfcelterids0()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisfrcwqmids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisfrcwqmids0.py
deleted file mode 100644
index 80fc98f6..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisfrcwqmids0.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_cuahsihisfrcwqmids0 import harvest_cuahsihisfrcwqmids0
-
-@job
-def implnet_job_cuahsihisfrcwqmids0():
- harvest_cuahsihisfrcwqmids0()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisghcnids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisghcnids0.py
deleted file mode 100644
index 89dcb842..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisghcnids0.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_cuahsihisghcnids0 import harvest_cuahsihisghcnids0
-
-@job
-def implnet_job_cuahsihisghcnids0():
- harvest_cuahsihisghcnids0()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisglacialridgeids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisglacialridgeids0.py
deleted file mode 100644
index 78d05082..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisglacialridgeids0.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_cuahsihisglacialridgeids0 import harvest_cuahsihisglacialridgeids0
-
-@job
-def implnet_job_cuahsihisglacialridgeids0():
- harvest_cuahsihisglacialridgeids0()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisgleonauburnids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisgleonauburnids0.py
deleted file mode 100644
index 9283dc5f..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisgleonauburnids0.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_cuahsihisgleonauburnids0 import harvest_cuahsihisgleonauburnids0
-
-@job
-def implnet_job_cuahsihisgleonauburnids0():
- harvest_cuahsihisgleonauburnids0()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisgleondorsetids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisgleondorsetids0.py
deleted file mode 100644
index 94f4e2ff..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisgleondorsetids0.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_cuahsihisgleondorsetids0 import harvest_cuahsihisgleondorsetids0
-
-@job
-def implnet_job_cuahsihisgleondorsetids0():
- harvest_cuahsihisgleondorsetids0()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisgleonlakeannieids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisgleonlakeannieids0.py
deleted file mode 100644
index 7ab573cd..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisgleonlakeannieids0.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_cuahsihisgleonlakeannieids0 import harvest_cuahsihisgleonlakeannieids0
-
-@job
-def implnet_job_cuahsihisgleonlakeannieids0():
- harvest_cuahsihisgleonlakeannieids0()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisgleonsunapeeids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisgleonsunapeeids0.py
deleted file mode 100644
index 325a6398..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisgleonsunapeeids0.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_cuahsihisgleonsunapeeids0 import harvest_cuahsihisgleonsunapeeids0
-
-@job
-def implnet_job_cuahsihisgleonsunapeeids0():
- harvest_cuahsihisgleonsunapeeids0()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisglobalriversobservatoryids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisglobalriversobservatoryids0.py
deleted file mode 100644
index 25f50f3d..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisglobalriversobservatoryids0.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_cuahsihisglobalriversobservatoryids0 import harvest_cuahsihisglobalriversobservatoryids0
-
-@job
-def implnet_job_cuahsihisglobalriversobservatoryids0():
- harvest_cuahsihisglobalriversobservatoryids0()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisgonggaids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisgonggaids0.py
deleted file mode 100644
index 6665bd2c..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisgonggaids0.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_cuahsihisgonggaids0 import harvest_cuahsihisgonggaids0
-
-@job
-def implnet_job_cuahsihisgonggaids0():
- harvest_cuahsihisgonggaids0()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihishassbergeids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihishassbergeids0.py
deleted file mode 100644
index d04b02c3..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihishassbergeids0.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_cuahsihishassbergeids0 import harvest_cuahsihishassbergeids0
-
-@job
-def implnet_job_cuahsihishassbergeids0():
- harvest_cuahsihishassbergeids0()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihishydrodataczdids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihishydrodataczdids0.py
deleted file mode 100644
index ee8b0f69..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihishydrodataczdids0.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_cuahsihishydrodataczdids0 import harvest_cuahsihishydrodataczdids0
-
-@job
-def implnet_job_cuahsihishydrodataczdids0():
- harvest_cuahsihishydrodataczdids0()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihishydrodataczhrids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihishydrodataczhrids0.py
deleted file mode 100644
index ed6554c7..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihishydrodataczhrids0.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_cuahsihishydrodataczhrids0 import harvest_cuahsihishydrodataczhrids0
-
-@job
-def implnet_job_cuahsihishydrodataczhrids0():
- harvest_cuahsihishydrodataczhrids0()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisieeratwilkesuniversityids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisieeratwilkesuniversityids0.py
deleted file mode 100644
index 57a2a259..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisieeratwilkesuniversityids0.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_cuahsihisieeratwilkesuniversityids0 import harvest_cuahsihisieeratwilkesuniversityids0
-
-@job
-def implnet_job_cuahsihisieeratwilkesuniversityids0():
- harvest_cuahsihisieeratwilkesuniversityids0()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisirwaids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisirwaids0.py
deleted file mode 100644
index b07f9970..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisirwaids0.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_cuahsihisirwaids0 import harvest_cuahsihisirwaids0
-
-@job
-def implnet_job_cuahsihisirwaids0():
- harvest_cuahsihisirwaids0()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisisbenaids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisisbenaids0.py
deleted file mode 100644
index b282caad..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisisbenaids0.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_cuahsihisisbenaids0 import harvest_cuahsihisisbenaids0
-
-@job
-def implnet_job_cuahsihisisbenaids0():
- harvest_cuahsihisisbenaids0()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihiskansasweatherdataids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihiskansasweatherdataids0.py
deleted file mode 100644
index 23913071..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihiskansasweatherdataids0.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_cuahsihiskansasweatherdataids0 import harvest_cuahsihiskansasweatherdataids0
-
-@job
-def implnet_job_cuahsihiskansasweatherdataids0():
- harvest_cuahsihiskansasweatherdataids0()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihislaselvastreamdischargeids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihislaselvastreamdischargeids0.py
deleted file mode 100644
index afb1f0b4..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihislaselvastreamdischargeids0.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_cuahsihislaselvastreamdischargeids0 import harvest_cuahsihislaselvastreamdischargeids0
-
-@job
-def implnet_job_cuahsihislaselvastreamdischargeids0():
- harvest_cuahsihislaselvastreamdischargeids0()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihislczoodm2ids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihislczoodm2ids0.py
deleted file mode 100644
index 94df9db0..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihislczoodm2ids0.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_cuahsihislczoodm2ids0 import harvest_cuahsihislczoodm2ids0
-
-@job
-def implnet_job_cuahsihislczoodm2ids0():
- harvest_cuahsihislczoodm2ids0()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihislittlebearriverids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihislittlebearriverids0.py
deleted file mode 100644
index 5b680e39..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihislittlebearriverids0.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_cuahsihislittlebearriverids0 import harvest_cuahsihislittlebearriverids0
-
-@job
-def implnet_job_cuahsihislittlebearriverids0():
- harvest_cuahsihislittlebearriverids0()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisloganrivergamutids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisloganrivergamutids0.py
deleted file mode 100644
index ab0eccb7..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisloganrivergamutids0.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_cuahsihisloganrivergamutids0 import harvest_cuahsihisloganrivergamutids0
-
-@job
-def implnet_job_cuahsihisloganrivergamutids0():
- harvest_cuahsihisloganrivergamutids0()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisloganriverids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisloganriverids0.py
deleted file mode 100644
index a4be3930..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisloganriverids0.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_cuahsihisloganriverids0 import harvest_cuahsihisloganriverids0
-
-@job
-def implnet_job_cuahsihisloganriverids0():
- harvest_cuahsihisloganriverids0()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihislterntlwoodruffids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihislterntlwoodruffids0.py
deleted file mode 100644
index 33c6b441..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihislterntlwoodruffids0.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_cuahsihislterntlwoodruffids0 import harvest_cuahsihislterntlwoodruffids0
-
-@job
-def implnet_job_cuahsihislterntlwoodruffids0():
- harvest_cuahsihislterntlwoodruffids0()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisluwlids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisluwlids0.py
deleted file mode 100644
index a799e703..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisluwlids0.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_cuahsihisluwlids0 import harvest_cuahsihisluwlids0
-
-@job
-def implnet_job_cuahsihisluwlids0():
- harvest_cuahsihisluwlids0()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihismaaeriids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihismaaeriids0.py
deleted file mode 100644
index 0a8b5de7..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihismaaeriids0.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_cuahsihismaaeriids0 import harvest_cuahsihismaaeriids0
-
-@job
-def implnet_job_cuahsihismaaeriids0():
- harvest_cuahsihismaaeriids0()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihismazarriverprojectids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihismazarriverprojectids0.py
deleted file mode 100644
index ef82758b..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihismazarriverprojectids0.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_cuahsihismazarriverprojectids0 import harvest_cuahsihismazarriverprojectids0
-
-@job
-def implnet_job_cuahsihismazarriverprojectids0():
- harvest_cuahsihismazarriverprojectids0()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihismmaatacamaids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihismmaatacamaids0.py
deleted file mode 100644
index 8730f346..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihismmaatacamaids0.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_cuahsihismmaatacamaids0 import harvest_cuahsihismmaatacamaids0
-
-@job
-def implnet_job_cuahsihismmaatacamaids0():
- harvest_cuahsihismmaatacamaids0()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihismobilecrowdhydrologyids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihismobilecrowdhydrologyids0.py
deleted file mode 100644
index 72244932..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihismobilecrowdhydrologyids0.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_cuahsihismobilecrowdhydrologyids0 import harvest_cuahsihismobilecrowdhydrologyids0
-
-@job
-def implnet_job_cuahsihismobilecrowdhydrologyids0():
- harvest_cuahsihismobilecrowdhydrologyids0()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihismopexids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihismopexids0.py
deleted file mode 100644
index 66dc9e0f..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihismopexids0.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_cuahsihismopexids0 import harvest_cuahsihismopexids0
-
-@job
-def implnet_job_cuahsihismopexids0():
- harvest_cuahsihismopexids0()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihismuddyriverids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihismuddyriverids0.py
deleted file mode 100644
index 9c35a512..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihismuddyriverids0.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_cuahsihismuddyriverids0 import harvest_cuahsihismuddyriverids0
-
-@job
-def implnet_job_cuahsihismuddyriverids0():
- harvest_cuahsihismuddyriverids0()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihismudlakeids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihismudlakeids0.py
deleted file mode 100644
index 1699cbef..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihismudlakeids0.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_cuahsihismudlakeids0 import harvest_cuahsihismudlakeids0
-
-@job
-def implnet_job_cuahsihismudlakeids0():
- harvest_cuahsihismudlakeids0()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihismwdisids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihismwdisids0.py
deleted file mode 100644
index 37e30248..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihismwdisids0.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_cuahsihismwdisids0 import harvest_cuahsihismwdisids0
-
-@job
-def implnet_job_cuahsihismwdisids0():
- harvest_cuahsihismwdisids0()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihismwraids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihismwraids0.py
deleted file mode 100644
index a8d7ee25..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihismwraids0.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_cuahsihismwraids0 import harvest_cuahsihismwraids0
-
-@job
-def implnet_job_cuahsihismwraids0():
- harvest_cuahsihismwraids0()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisnashrwaids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisnashrwaids0.py
deleted file mode 100644
index e74a54c3..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisnashrwaids0.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_cuahsihisnashrwaids0 import harvest_cuahsihisnashrwaids0
-
-@job
-def implnet_job_cuahsihisnashrwaids0():
- harvest_cuahsihisnashrwaids0()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisnceiww2ids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisnceiww2ids0.py
deleted file mode 100644
index 346fe0f9..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisnceiww2ids0.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_cuahsihisnceiww2ids0 import harvest_cuahsihisnceiww2ids0
-
-@job
-def implnet_job_cuahsihisnceiww2ids0():
- harvest_cuahsihisnceiww2ids0()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisneonids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisneonids0.py
deleted file mode 100644
index 150231ef..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisneonids0.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_cuahsihisneonids0 import harvest_cuahsihisneonids0
-
-@job
-def implnet_job_cuahsihisneonids0():
- harvest_cuahsihisneonids0()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisnevadosids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisnevadosids0.py
deleted file mode 100644
index 589d4d1d..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisnevadosids0.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_cuahsihisnevadosids0 import harvest_cuahsihisnevadosids0
-
-@job
-def implnet_job_cuahsihisnevadosids0():
- harvest_cuahsihisnevadosids0()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisnevcanids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisnevcanids0.py
deleted file mode 100644
index 44b3a006..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisnevcanids0.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_cuahsihisnevcanids0 import harvest_cuahsihisnevcanids0
-
-@job
-def implnet_job_cuahsihisnevcanids0():
- harvest_cuahsihisnevcanids0()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisnewnids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisnewnids0.py
deleted file mode 100644
index 506ea9c6..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisnewnids0.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_cuahsihisnewnids0 import harvest_cuahsihisnewnids0
-
-@job
-def implnet_job_cuahsihisnewnids0():
- harvest_cuahsihisnewnids0()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisnhgswofids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisnhgswofids0.py
deleted file mode 100644
index 5865499d..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisnhgswofids0.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_cuahsihisnhgswofids0 import harvest_cuahsihisnhgswofids0
-
-@job
-def implnet_job_cuahsihisnhgswofids0():
- harvest_cuahsihisnhgswofids0()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisnooksackmicroclimatenetworkids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisnooksackmicroclimatenetworkids0.py
deleted file mode 100644
index 81358e29..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisnooksackmicroclimatenetworkids0.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_cuahsihisnooksackmicroclimatenetworkids0 import harvest_cuahsihisnooksackmicroclimatenetworkids0
-
-@job
-def implnet_job_cuahsihisnooksackmicroclimatenetworkids0():
- harvest_cuahsihisnooksackmicroclimatenetworkids0()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisodmkentstateids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisodmkentstateids0.py
deleted file mode 100644
index 63a818e9..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisodmkentstateids0.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_cuahsihisodmkentstateids0 import harvest_cuahsihisodmkentstateids0
-
-@job
-def implnet_job_cuahsihisodmkentstateids0():
- harvest_cuahsihisodmkentstateids0()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisorsancohabids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisorsancohabids0.py
deleted file mode 100644
index d2aee63d..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisorsancohabids0.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_cuahsihisorsancohabids0 import harvest_cuahsihisorsancohabids0
-
-@job
-def implnet_job_cuahsihisorsancohabids0():
- harvest_cuahsihisorsancohabids0()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihispanolaodmids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihispanolaodmids0.py
deleted file mode 100644
index 73672ec0..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihispanolaodmids0.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_cuahsihispanolaodmids0 import harvest_cuahsihispanolaodmids0
-
-@job
-def implnet_job_cuahsihispanolaodmids0():
- harvest_cuahsihispanolaodmids0()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisparalanaturalezaids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisparalanaturalezaids0.py
deleted file mode 100644
index dcfe3c65..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisparalanaturalezaids0.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_cuahsihisparalanaturalezaids0 import harvest_cuahsihisparalanaturalezaids0
-
-@job
-def implnet_job_cuahsihisparalanaturalezaids0():
- harvest_cuahsihisparalanaturalezaids0()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisprovorivergamutids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisprovorivergamutids0.py
deleted file mode 100644
index dc931cd4..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisprovorivergamutids0.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_cuahsihisprovorivergamutids0 import harvest_cuahsihisprovorivergamutids0
-
-@job
-def implnet_job_cuahsihisprovorivergamutids0():
- harvest_cuahsihisprovorivergamutids0()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisredbuttecreekgamutids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisredbuttecreekgamutids0.py
deleted file mode 100644
index 6fbfc3df..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisredbuttecreekgamutids0.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_cuahsihisredbuttecreekgamutids0 import harvest_cuahsihisredbuttecreekgamutids0
-
-@job
-def implnet_job_cuahsihisredbuttecreekgamutids0():
- harvest_cuahsihisredbuttecreekgamutids0()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisrmblids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisrmblids0.py
deleted file mode 100644
index 21a37e3b..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisrmblids0.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_cuahsihisrmblids0 import harvest_cuahsihisrmblids0
-
-@job
-def implnet_job_cuahsihisrmblids0():
- harvest_cuahsihisrmblids0()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihissagehencreekids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihissagehencreekids0.py
deleted file mode 100644
index 4f71a81a..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihissagehencreekids0.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_cuahsihissagehencreekids0 import harvest_cuahsihissagehencreekids0
-
-@job
-def implnet_job_cuahsihissagehencreekids0():
- harvest_cuahsihissagehencreekids0()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisscanids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisscanids0.py
deleted file mode 100644
index bef484cc..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisscanids0.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_cuahsihisscanids0 import harvest_cuahsihisscanids0
-
-@job
-def implnet_job_cuahsihisscanids0():
- harvest_cuahsihisscanids0()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisscotlandnwisids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisscotlandnwisids0.py
deleted file mode 100644
index 9f5605ef..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisscotlandnwisids0.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_cuahsihisscotlandnwisids0 import harvest_cuahsihisscotlandnwisids0
-
-@job
-def implnet_job_cuahsihisscotlandnwisids0():
- harvest_cuahsihisscotlandnwisids0()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisshalenetworkodmids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisshalenetworkodmids0.py
deleted file mode 100644
index aa868715..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisshalenetworkodmids0.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_cuahsihisshalenetworkodmids0 import harvest_cuahsihisshalenetworkodmids0
-
-@job
-def implnet_job_cuahsihisshalenetworkodmids0():
- harvest_cuahsihisshalenetworkodmids0()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisshalenetworkodmids1.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisshalenetworkodmids1.py
deleted file mode 100644
index 30acd4bb..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisshalenetworkodmids1.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_cuahsihisshalenetworkodmids1 import harvest_cuahsihisshalenetworkodmids1
-
-@job
-def implnet_job_cuahsihisshalenetworkodmids1():
- harvest_cuahsihisshalenetworkodmids1()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisskcmilltownids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisskcmilltownids0.py
deleted file mode 100644
index 28d62e67..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisskcmilltownids0.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_cuahsihisskcmilltownids0 import harvest_cuahsihisskcmilltownids0
-
-@job
-def implnet_job_cuahsihisskcmilltownids0():
- harvest_cuahsihisskcmilltownids0()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihissnotelids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihissnotelids0.py
deleted file mode 100644
index fc5fa17f..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihissnotelids0.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_cuahsihissnotelids0 import harvest_cuahsihissnotelids0
-
-@job
-def implnet_job_cuahsihissnotelids0():
- harvest_cuahsihissnotelids0()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisswedishmonitoringdataids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisswedishmonitoringdataids0.py
deleted file mode 100644
index 919c7165..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisswedishmonitoringdataids0.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_cuahsihisswedishmonitoringdataids0 import harvest_cuahsihisswedishmonitoringdataids0
-
-@job
-def implnet_job_cuahsihisswedishmonitoringdataids0():
- harvest_cuahsihisswedishmonitoringdataids0()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihistarlandwaterqualityids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihistarlandwaterqualityids0.py
deleted file mode 100644
index c3d808fd..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihistarlandwaterqualityids0.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_cuahsihistarlandwaterqualityids0 import harvest_cuahsihistarlandwaterqualityids0
-
-@job
-def implnet_job_cuahsihistarlandwaterqualityids0():
- harvest_cuahsihistarlandwaterqualityids0()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihistncwaterdataids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihistncwaterdataids0.py
deleted file mode 100644
index c0e44f7d..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihistncwaterdataids0.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_cuahsihistncwaterdataids0 import harvest_cuahsihistncwaterdataids0
-
-@job
-def implnet_job_cuahsihistncwaterdataids0():
- harvest_cuahsihistncwaterdataids0()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihistrwaids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihistrwaids0.py
deleted file mode 100644
index 4989a99f..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihistrwaids0.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_cuahsihistrwaids0 import harvest_cuahsihistrwaids0
-
-@job
-def implnet_job_cuahsihistrwaids0():
- harvest_cuahsihistrwaids0()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihistuolumnemdwids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihistuolumnemdwids0.py
deleted file mode 100644
index 91e6881d..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihistuolumnemdwids0.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_cuahsihistuolumnemdwids0 import harvest_cuahsihistuolumnemdwids0
-
-@job
-def implnet_job_cuahsihistuolumnemdwids0():
- harvest_cuahsihistuolumnemdwids0()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisubwpadids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisubwpadids0.py
deleted file mode 100644
index 0d786be8..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisubwpadids0.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_cuahsihisubwpadids0 import harvest_cuahsihisubwpadids0
-
-@job
-def implnet_job_cuahsihisubwpadids0():
- harvest_cuahsihisubwpadids0()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisumbcgwids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisumbcgwids0.py
deleted file mode 100644
index 7e4113c8..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisumbcgwids0.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_cuahsihisumbcgwids0 import harvest_cuahsihisumbcgwids0
-
-@job
-def implnet_job_cuahsihisumbcgwids0():
- harvest_cuahsihisumbcgwids0()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisumbcwqids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisumbcwqids0.py
deleted file mode 100644
index 848f6cbd..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisumbcwqids0.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_cuahsihisumbcwqids0 import harvest_cuahsihisumbcwqids0
-
-@job
-def implnet_job_cuahsihisumbcwqids0():
- harvest_cuahsihisumbcwqids0()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisunhsnowids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisunhsnowids0.py
deleted file mode 100644
index 4a2c71f6..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisunhsnowids0.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_cuahsihisunhsnowids0 import harvest_cuahsihisunhsnowids0
-
-@job
-def implnet_job_cuahsihisunhsnowids0():
- harvest_cuahsihisunhsnowids0()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisweiherbachids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisweiherbachids0.py
deleted file mode 100644
index 364a09d1..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisweiherbachids0.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_cuahsihisweiherbachids0 import harvest_cuahsihisweiherbachids0
-
-@job
-def implnet_job_cuahsihisweiherbachids0():
- harvest_cuahsihisweiherbachids0()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisyosemitehydroclimatenetworkids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisyosemitehydroclimatenetworkids0.py
deleted file mode 100644
index f21cfcd0..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisyosemitehydroclimatenetworkids0.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_cuahsihisyosemitehydroclimatenetworkids0 import harvest_cuahsihisyosemitehydroclimatenetworkids0
-
-@job
-def implnet_job_cuahsihisyosemitehydroclimatenetworkids0():
- harvest_cuahsihisyosemitehydroclimatenetworkids0()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_dams0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_dams0.py
deleted file mode 100644
index bf92c53e..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_dams0.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_dams0 import harvest_dams0
-
-@job
-def implnet_job_dams0():
- harvest_dams0()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_dams1.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_dams1.py
deleted file mode 100644
index a675c6a0..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_dams1.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_dams1 import harvest_dams1
-
-@job
-def implnet_job_dams1():
- harvest_dams1()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_damspids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_damspids0.py
deleted file mode 100644
index f0fed16e..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_damspids0.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_damspids0 import harvest_damspids0
-
-@job
-def implnet_job_damspids0():
- harvest_damspids0()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_demo0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_demo0.py
deleted file mode 100644
index 51eb0266..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_demo0.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_demo0 import harvest_demo0
-
-@job
-def implnet_job_demo0():
- harvest_demo0()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_gfv11pois0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_gfv11pois0.py
deleted file mode 100644
index a291992d..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_gfv11pois0.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_gfv11pois0 import harvest_gfv11pois0
-
-@job
-def implnet_job_gfv11pois0():
- harvest_gfv11pois0()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_gfv11pois1.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_gfv11pois1.py
deleted file mode 100644
index cba3692f..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_gfv11pois1.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_gfv11pois1 import harvest_gfv11pois1
-
-@job
-def implnet_job_gfv11pois1():
- harvest_gfv11pois1()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_hmw0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_hmw0.py
deleted file mode 100644
index 29d53172..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_hmw0.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_hmw0 import harvest_hmw0
-
-@job
-def implnet_job_hmw0():
- harvest_hmw0()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_hmw1.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_hmw1.py
deleted file mode 100644
index f9d1ad68..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_hmw1.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_hmw1 import harvest_hmw1
-
-@job
-def implnet_job_hmw1():
- harvest_hmw1()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_hu020.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_hu020.py
deleted file mode 100644
index fcde012d..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_hu020.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_hu020 import harvest_hu020
-
-@job
-def implnet_job_hu020():
- harvest_hu020()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_hu040.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_hu040.py
deleted file mode 100644
index 7776e00a..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_hu040.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_hu040 import harvest_hu040
-
-@job
-def implnet_job_hu040():
- harvest_hu040()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_hu060.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_hu060.py
deleted file mode 100644
index fe4a44a3..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_hu060.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_hu060 import harvest_hu060
-
-@job
-def implnet_job_hu060():
- harvest_hu060()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_hu080.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_hu080.py
deleted file mode 100644
index 412a1b3b..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_hu080.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_hu080 import harvest_hu080
-
-@job
-def implnet_job_hu080():
- harvest_hu080()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_hu100.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_hu100.py
deleted file mode 100644
index fa812d46..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_hu100.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_hu100 import harvest_hu100
-
-@job
-def implnet_job_hu100():
- harvest_hu100()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_huc12pp0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_huc12pp0.py
deleted file mode 100644
index e498fcad..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_huc12pp0.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_huc12pp0 import harvest_huc12pp0
-
-@job
-def implnet_job_huc12pp0():
- harvest_huc12pp0()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_huc12pp1.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_huc12pp1.py
deleted file mode 100644
index 47225877..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_huc12pp1.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_huc12pp1 import harvest_huc12pp1
-
-@job
-def implnet_job_huc12pp1():
- harvest_huc12pp1()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_hydrologicunit0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_hydrologicunit0.py
deleted file mode 100644
index 626b412d..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_hydrologicunit0.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_hydrologicunit0 import harvest_hydrologicunit0
-
-@job
-def implnet_job_hydrologicunit0():
- harvest_hydrologicunit0()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_links0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_links0.py
deleted file mode 100644
index 90350f96..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_links0.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_links0 import harvest_links0
-
-@job
-def implnet_job_links0():
- harvest_links0()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_mainstems0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_mainstems0.py
deleted file mode 100644
index 3e6ab4c4..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_mainstems0.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_mainstems0 import harvest_mainstems0
-
-@job
-def implnet_job_mainstems0():
- harvest_mainstems0()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nataq0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nataq0.py
deleted file mode 100644
index fcac3e9c..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nataq0.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_nataq0 import harvest_nataq0
-
-@job
-def implnet_job_nataq0():
- harvest_nataq0()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nmwdiose0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nmwdiose0.py
deleted file mode 100644
index c1ab5b1f..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nmwdiose0.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_nmwdiose0 import harvest_nmwdiose0
-
-@job
-def implnet_job_nmwdiose0():
- harvest_nmwdiose0()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nmwdiose1.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nmwdiose1.py
deleted file mode 100644
index 736fdf78..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nmwdiose1.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_nmwdiose1 import harvest_nmwdiose1
-
-@job
-def implnet_job_nmwdiose1():
- harvest_nmwdiose1()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nmwdiose2.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nmwdiose2.py
deleted file mode 100644
index f2ec56e3..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nmwdiose2.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_nmwdiose2 import harvest_nmwdiose2
-
-@job
-def implnet_job_nmwdiose2():
- harvest_nmwdiose2()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nmwdiose3.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nmwdiose3.py
deleted file mode 100644
index f7d1aefd..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nmwdiose3.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_nmwdiose3 import harvest_nmwdiose3
-
-@job
-def implnet_job_nmwdiose3():
- harvest_nmwdiose3()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nmwdiose4.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nmwdiose4.py
deleted file mode 100644
index dd1ab409..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nmwdiose4.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_nmwdiose4 import harvest_nmwdiose4
-
-@job
-def implnet_job_nmwdiose4():
- harvest_nmwdiose4()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nmwdist0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nmwdist0.py
deleted file mode 100644
index 5a3fb84b..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nmwdist0.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_nmwdist0 import harvest_nmwdist0
-
-@job
-def implnet_job_nmwdist0():
- harvest_nmwdist0()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw0.py
deleted file mode 100644
index 257aebff..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw0.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_nwisgw0 import harvest_nwisgw0
-
-@job
-def implnet_job_nwisgw0():
- harvest_nwisgw0()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw1.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw1.py
deleted file mode 100644
index f6af1ef2..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw1.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_nwisgw1 import harvest_nwisgw1
-
-@job
-def implnet_job_nwisgw1():
- harvest_nwisgw1()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw10.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw10.py
deleted file mode 100644
index d3443419..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw10.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_nwisgw10 import harvest_nwisgw10
-
-@job
-def implnet_job_nwisgw10():
- harvest_nwisgw10()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw11.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw11.py
deleted file mode 100644
index e294a0df..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw11.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_nwisgw11 import harvest_nwisgw11
-
-@job
-def implnet_job_nwisgw11():
- harvest_nwisgw11()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw12.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw12.py
deleted file mode 100644
index 2c0cf870..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw12.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_nwisgw12 import harvest_nwisgw12
-
-@job
-def implnet_job_nwisgw12():
- harvest_nwisgw12()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw13.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw13.py
deleted file mode 100644
index 02dd0892..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw13.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_nwisgw13 import harvest_nwisgw13
-
-@job
-def implnet_job_nwisgw13():
- harvest_nwisgw13()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw14.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw14.py
deleted file mode 100644
index e0123111..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw14.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_nwisgw14 import harvest_nwisgw14
-
-@job
-def implnet_job_nwisgw14():
- harvest_nwisgw14()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw15.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw15.py
deleted file mode 100644
index 04d121eb..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw15.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_nwisgw15 import harvest_nwisgw15
-
-@job
-def implnet_job_nwisgw15():
- harvest_nwisgw15()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw16.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw16.py
deleted file mode 100644
index 2aafac7d..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw16.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_nwisgw16 import harvest_nwisgw16
-
-@job
-def implnet_job_nwisgw16():
- harvest_nwisgw16()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw17.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw17.py
deleted file mode 100644
index 12533de5..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw17.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_nwisgw17 import harvest_nwisgw17
-
-@job
-def implnet_job_nwisgw17():
- harvest_nwisgw17()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw18.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw18.py
deleted file mode 100644
index 5e04df4f..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw18.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_nwisgw18 import harvest_nwisgw18
-
-@job
-def implnet_job_nwisgw18():
- harvest_nwisgw18()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw19.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw19.py
deleted file mode 100644
index 5481ee6b..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw19.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_nwisgw19 import harvest_nwisgw19
-
-@job
-def implnet_job_nwisgw19():
- harvest_nwisgw19()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw2.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw2.py
deleted file mode 100644
index 107cbaf4..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw2.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_nwisgw2 import harvest_nwisgw2
-
-@job
-def implnet_job_nwisgw2():
- harvest_nwisgw2()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw20.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw20.py
deleted file mode 100644
index fc963f46..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw20.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_nwisgw20 import harvest_nwisgw20
-
-@job
-def implnet_job_nwisgw20():
- harvest_nwisgw20()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw21.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw21.py
deleted file mode 100644
index 8dcdfeb4..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw21.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_nwisgw21 import harvest_nwisgw21
-
-@job
-def implnet_job_nwisgw21():
- harvest_nwisgw21()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw22.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw22.py
deleted file mode 100644
index 6688765c..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw22.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_nwisgw22 import harvest_nwisgw22
-
-@job
-def implnet_job_nwisgw22():
- harvest_nwisgw22()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw23.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw23.py
deleted file mode 100644
index 235ea9a2..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw23.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_nwisgw23 import harvest_nwisgw23
-
-@job
-def implnet_job_nwisgw23():
- harvest_nwisgw23()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw24.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw24.py
deleted file mode 100644
index 4629fa91..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw24.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_nwisgw24 import harvest_nwisgw24
-
-@job
-def implnet_job_nwisgw24():
- harvest_nwisgw24()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw25.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw25.py
deleted file mode 100644
index dafa0ec6..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw25.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_nwisgw25 import harvest_nwisgw25
-
-@job
-def implnet_job_nwisgw25():
- harvest_nwisgw25()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw26.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw26.py
deleted file mode 100644
index 4492f72f..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw26.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_nwisgw26 import harvest_nwisgw26
-
-@job
-def implnet_job_nwisgw26():
- harvest_nwisgw26()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw27.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw27.py
deleted file mode 100644
index 84049cfc..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw27.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_nwisgw27 import harvest_nwisgw27
-
-@job
-def implnet_job_nwisgw27():
- harvest_nwisgw27()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw28.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw28.py
deleted file mode 100644
index e2ea0f05..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw28.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_nwisgw28 import harvest_nwisgw28
-
-@job
-def implnet_job_nwisgw28():
- harvest_nwisgw28()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw3.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw3.py
deleted file mode 100644
index 416f80a6..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw3.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_nwisgw3 import harvest_nwisgw3
-
-@job
-def implnet_job_nwisgw3():
- harvest_nwisgw3()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw4.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw4.py
deleted file mode 100644
index 95ec7076..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw4.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_nwisgw4 import harvest_nwisgw4
-
-@job
-def implnet_job_nwisgw4():
- harvest_nwisgw4()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw5.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw5.py
deleted file mode 100644
index c84cdeae..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw5.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_nwisgw5 import harvest_nwisgw5
-
-@job
-def implnet_job_nwisgw5():
- harvest_nwisgw5()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw6.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw6.py
deleted file mode 100644
index d9c3d1db..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw6.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_nwisgw6 import harvest_nwisgw6
-
-@job
-def implnet_job_nwisgw6():
- harvest_nwisgw6()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw7.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw7.py
deleted file mode 100644
index 2a77ef8e..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw7.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_nwisgw7 import harvest_nwisgw7
-
-@job
-def implnet_job_nwisgw7():
- harvest_nwisgw7()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw8.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw8.py
deleted file mode 100644
index 8f8741bd..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw8.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_nwisgw8 import harvest_nwisgw8
-
-@job
-def implnet_job_nwisgw8():
- harvest_nwisgw8()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw9.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw9.py
deleted file mode 100644
index 4cd8710a..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw9.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_nwisgw9 import harvest_nwisgw9
-
-@job
-def implnet_job_nwisgw9():
- harvest_nwisgw9()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwissite0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwissite0.py
deleted file mode 100644
index 8401b1e0..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwissite0.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_nwissite0 import harvest_nwissite0
-
-@job
-def implnet_job_nwissite0():
- harvest_nwissite0()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwissite1.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwissite1.py
deleted file mode 100644
index 78009468..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwissite1.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_nwissite1 import harvest_nwissite1
-
-@job
-def implnet_job_nwissite1():
- harvest_nwissite1()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwissite2.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwissite2.py
deleted file mode 100644
index 8e7c759c..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwissite2.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_nwissite2 import harvest_nwissite2
-
-@job
-def implnet_job_nwissite2():
- harvest_nwissite2()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwissite3.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwissite3.py
deleted file mode 100644
index 3106e4b7..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwissite3.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_nwissite3 import harvest_nwissite3
-
-@job
-def implnet_job_nwissite3():
- harvest_nwissite3()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_places0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_places0.py
deleted file mode 100644
index 865efac7..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_places0.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_places0 import harvest_places0
-
-@job
-def implnet_job_places0():
- harvest_places0()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_princiaq0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_princiaq0.py
deleted file mode 100644
index 3dda69ff..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_princiaq0.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_princiaq0 import harvest_princiaq0
-
-@job
-def implnet_job_princiaq0():
- harvest_princiaq0()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_pws0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_pws0.py
deleted file mode 100644
index 4126696f..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_pws0.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_pws0 import harvest_pws0
-
-@job
-def implnet_job_pws0():
- harvest_pws0()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_refgage0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_refgage0.py
deleted file mode 100644
index 46607415..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_refgage0.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_refgage0 import harvest_refgage0
-
-@job
-def implnet_job_refgage0():
- harvest_refgage0()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_refgage1.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_refgage1.py
deleted file mode 100644
index 2dcdcab4..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_refgage1.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_refgage1 import harvest_refgage1
-
-@job
-def implnet_job_refgage1():
- harvest_refgage1()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_refgage2.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_refgage2.py
deleted file mode 100644
index b9bc1a4c..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_refgage2.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_refgage2 import harvest_refgage2
-
-@job
-def implnet_job_refgage2():
- harvest_refgage2()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_refgage3.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_refgage3.py
deleted file mode 100644
index 870cc1f2..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_refgage3.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_refgage3 import harvest_refgage3
-
-@job
-def implnet_job_refgage3():
- harvest_refgage3()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_rise0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_rise0.py
deleted file mode 100644
index 7f5dd685..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_rise0.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_rise0 import harvest_rise0
-
-@job
-def implnet_job_rise0():
- harvest_rise0()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_sechydrgreg0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_sechydrgreg0.py
deleted file mode 100644
index 6747ad00..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_sechydrgreg0.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_sechydrgreg0 import harvest_sechydrgreg0
-
-@job
-def implnet_job_sechydrgreg0():
- harvest_sechydrgreg0()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_selfieids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_selfieids0.py
deleted file mode 100644
index b4a6da97..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_selfieids0.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_selfieids0 import harvest_selfieids0
-
-@job
-def implnet_job_selfieids0():
- harvest_selfieids0()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_states0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_states0.py
deleted file mode 100644
index e40f041a..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_states0.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_states0 import harvest_states0
-
-@job
-def implnet_job_states0():
- harvest_states0()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_ua100.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_ua100.py
deleted file mode 100644
index 159f619e..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_ua100.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_ua100 import harvest_ua100
-
-@job
-def implnet_job_ua100():
- harvest_ua100()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_wade0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_wade0.py
deleted file mode 100644
index d188b950..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_wade0.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_wade0 import harvest_wade0
-
-@job
-def implnet_job_wade0():
- harvest_wade0()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_wade1.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_wade1.py
deleted file mode 100644
index 725a3a0a..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_wade1.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_wade1 import harvest_wade1
-
-@job
-def implnet_job_wade1():
- harvest_wade1()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_wade10.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_wade10.py
deleted file mode 100644
index fee08238..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_wade10.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_wade10 import harvest_wade10
-
-@job
-def implnet_job_wade10():
- harvest_wade10()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_wade11.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_wade11.py
deleted file mode 100644
index ba521353..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_wade11.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_wade11 import harvest_wade11
-
-@job
-def implnet_job_wade11():
- harvest_wade11()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_wade12.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_wade12.py
deleted file mode 100644
index bc4e6108..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_wade12.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_wade12 import harvest_wade12
-
-@job
-def implnet_job_wade12():
- harvest_wade12()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_wade13.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_wade13.py
deleted file mode 100644
index 291f8c66..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_wade13.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_wade13 import harvest_wade13
-
-@job
-def implnet_job_wade13():
- harvest_wade13()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_wade14.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_wade14.py
deleted file mode 100644
index 955c68e6..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_wade14.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_wade14 import harvest_wade14
-
-@job
-def implnet_job_wade14():
- harvest_wade14()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_wade15.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_wade15.py
deleted file mode 100644
index 5e982edf..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_wade15.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_wade15 import harvest_wade15
-
-@job
-def implnet_job_wade15():
- harvest_wade15()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_wade16.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_wade16.py
deleted file mode 100644
index 2f54ea43..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_wade16.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_wade16 import harvest_wade16
-
-@job
-def implnet_job_wade16():
- harvest_wade16()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_wade17.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_wade17.py
deleted file mode 100644
index 18c747f2..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_wade17.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_wade17 import harvest_wade17
-
-@job
-def implnet_job_wade17():
- harvest_wade17()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_wade18.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_wade18.py
deleted file mode 100644
index 5bd5ce6f..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_wade18.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_wade18 import harvest_wade18
-
-@job
-def implnet_job_wade18():
- harvest_wade18()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_wade19.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_wade19.py
deleted file mode 100644
index 1641d82a..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_wade19.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_wade19 import harvest_wade19
-
-@job
-def implnet_job_wade19():
- harvest_wade19()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_wade2.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_wade2.py
deleted file mode 100644
index 0fda70a7..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_wade2.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_wade2 import harvest_wade2
-
-@job
-def implnet_job_wade2():
- harvest_wade2()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_wade3.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_wade3.py
deleted file mode 100644
index bbb207e3..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_wade3.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_wade3 import harvest_wade3
-
-@job
-def implnet_job_wade3():
- harvest_wade3()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_wade4.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_wade4.py
deleted file mode 100644
index 63e92ddb..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_wade4.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_wade4 import harvest_wade4
-
-@job
-def implnet_job_wade4():
- harvest_wade4()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_wade5.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_wade5.py
deleted file mode 100644
index 74c0c863..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_wade5.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_wade5 import harvest_wade5
-
-@job
-def implnet_job_wade5():
- harvest_wade5()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_wade6.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_wade6.py
deleted file mode 100644
index 44732ec2..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_wade6.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_wade6 import harvest_wade6
-
-@job
-def implnet_job_wade6():
- harvest_wade6()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_wade7.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_wade7.py
deleted file mode 100644
index 2825e5da..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_wade7.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_wade7 import harvest_wade7
-
-@job
-def implnet_job_wade7():
- harvest_wade7()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_wade8.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_wade8.py
deleted file mode 100644
index ec6a046e..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_wade8.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_wade8 import harvest_wade8
-
-@job
-def implnet_job_wade8():
- harvest_wade8()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_wade9.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_wade9.py
deleted file mode 100644
index ed099b32..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_wade9.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_wade9 import harvest_wade9
-
-@job
-def implnet_job_wade9():
- harvest_wade9()
\ No newline at end of file
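Note: every implnet_jobs_*.py deleted above follows a single generated
template; only the per-source suffix changes. The next hunk removes the
matching ops module, which supplies the harvest_* graphs these jobs wrap.
A minimal sketch of how such job modules could be emitted (an illustrative
assumption, not the project's actual generator):

    from pathlib import Path

    def render_job(s: str) -> str:
        # one generated Dagster job module per source:
        # import the harvest op, wrap it in a @job
        return (
            "from dagster import job\n"
            "\n"
            f"from ops.implnet_ops_{s} import harvest_{s}\n"
            "\n"
            "@job\n"
            f"def implnet_job_{s}():\n"
            f"    harvest_{s}()\n"
        )

    for s in ("wade9", "nwisgw0"):  # source names taken from the hunks above
        Path(f"implnet_jobs_{s}.py").write_text(render_job(s))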
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_aiannh0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_aiannh0.py
deleted file mode 100644
index 8ef7b64a..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_aiannh0.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils.util
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, and not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
- # revision of EC utilities, will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
- ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
- # docker-py: for a replicated job, total completions = replicas
- # with replicas=0 you do not get a container
- service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
- mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise Exception(f"Container for service {name} not starting")
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
- cid = container.id # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
- ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
- # Future: need to extract files and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def aiannh0_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def aiannh0_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "aiannh0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def aiannh0_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "aiannh0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def aiannh0_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "aiannh0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def aiannh0_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "aiannh0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def aiannh0_naburelease(context):
- returned_value = gleanerio(context,("release"), "aiannh0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def aiannh0_uploadrelease(context):
- returned_value = postRelease("aiannh0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def aiannh0_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="aiannh0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "aiannh0"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def aiannh0_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="aiannh0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "aiannh0"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def aiannh0_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="aiannh0")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "aiannh0"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def aiannh0_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="aiannh0")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "aiannh0"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifier stats report returned {r}")
- return
-
-@op(ins={"start": In(Nothing)})
-def aiannh0_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "aiannh0"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucket urls report returned {r}")
- return
-
-
-#Can we simplify and use just a method. Then import these methods?
-# def missingreport_s3(context, msg: str, source="aiannh0"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="aiannh0"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_aiannh0():
- containers = aiannh0_getImage()
- harvest = aiannh0_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = aiannh0_missingreport_s3(start=harvest)
- report_idstat = aiannh0_identifier_stats(start=report_ms3)
- # for some reason, this causes a msg parameter missing
- report_bucketurl = aiannh0_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="aiannh0")
- load_release = aiannh0_naburelease(start=harvest)
- load_uploadrelease = aiannh0_uploadrelease(start=load_release)
-
- load_prune = aiannh0_nabu_prune(start=load_uploadrelease)
- load_prov = aiannh0_nabuprov(start=load_prune)
- load_org = aiannh0_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=aiannh0_missingreport_graph(start=load_org)
- report_graph=aiannh0_graph_reports(start=report_msgraph)
-
-
-
-
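The generated `harvest_aiannh0` graph above orders its ops purely through Dagster `Nothing` dependencies: each downstream op declares `ins={"start": In(Nothing)}`, and the graph passes the upstream invocation as `start=...`, so sequencing is enforced without moving data between ops. A minimal, self-contained sketch of the same wiring pattern (op and job names here are illustrative, not part of the generated code):

```python
from dagster import In, Nothing, get_dagster_logger, graph, op


@op
def pull_images():
    # stands in for <source>_getImage; produces nothing of interest
    get_dagster_logger().info("pull images")


@op(ins={"start": In(Nothing)})
def harvest():
    # stands in for <source>_gleaner; runs only after pull_images
    get_dagster_logger().info("harvest")


@op(ins={"start": In(Nothing)})
def release():
    # stands in for <source>_naburelease; runs only after harvest
    get_dagster_logger().info("release")


@graph
def harvest_sketch():
    # ordering is expressed entirely via the Nothing-typed `start` inputs
    release(start=harvest(start=pull_images()))


harvest_sketch_job = harvest_sketch.to_job()

if __name__ == "__main__":
    harvest_sketch_job.execute_in_process()
```

Because the inputs are `Nothing`-typed, the generated ops stay side-effecting (containers, S3 writes) while Dagster still renders and enforces the DAG shown in `harvest_aiannh0`.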
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_autotest10.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_autotest10.py
deleted file mode 100644
index 6d0e34d6..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_autotest10.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils.util  # bare "import distutils" does not reliably expose distutils.util
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- # read the file as bytes; the with-block closes the handle promptly
- with open(image_path, 'rb') as f:
- data = f.read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
- # revision of EC utilities, will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
- ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
- # docker-py: for a replicated job, total completions = replicas
- # with replicas=0 you do not get a container
- service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
- mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise Exception(f"Container for service {name} not starting")
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
- cid = container.id # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
- ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
- # Future: need to extract files and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def autotest10_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def autotest10_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "autotest10")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def autotest10_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "autotest10")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def autotest10_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "autotest10")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def autotest10_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "autotest10")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def autotest10_naburelease(context):
- returned_value = gleanerio(context,("release"), "autotest10")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def autotest10_uploadrelease(context):
- returned_value = postRelease("autotest10")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def autotest10_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="autotest10")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "autotest10"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def autotest10_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="autotest10")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "autotest10"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def autotest10_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="autotest10")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "autotest10"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def autotest10_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="autotest10")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "autotest10"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifier stats report returned {r}")
- return
-
-@op(ins={"start": In(Nothing)})
-def autotest10_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "autotest10"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucket urls report returned {r}")
- return
-
-
-#Can we simplify and use just a method. Then import these methods?
-# def missingreport_s3(context, msg: str, source="autotest10"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="autotest10"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_autotest10():
- containers = autotest10_getImage()
- harvest = autotest10_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = autotest10_missingreport_s3(start=harvest)
- report_idstat = autotest10_identifier_stats(start=report_ms3)
- # for some reason, this causes a msg parameter missing
- report_bucketurl = autotest10_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="autotest10")
- load_release = autotest10_naburelease(start=harvest)
- load_uploadrelease = autotest10_uploadrelease(start=load_release)
-
- load_prune = autotest10_nabu_prune(start=load_uploadrelease)
- load_prov = autotest10_nabuprov(start=load_prune)
- load_org = autotest10_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=autotest10_missingreport_graph(start=load_org)
- report_graph=autotest10_graph_reports(start=report_msgraph)
-
-
-
-
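`_create_service` above runs each Gleaner/Nabu invocation as a one-shot Docker Swarm service: `ServiceMode("replicated-job", concurrency=1, replicas=1)` plus a `none` restart policy executes the task exactly once, and the code then polls for the backing container via the `com.docker.swarm.service.name` label. A minimal sketch of that pattern in isolation (assumes a reachable Swarm manager; the `alpine` image and service name are placeholders):

```python
import time

import docker
from docker.types import RestartPolicy, ServiceMode

client = docker.from_env()

# replicated-job + replicas=1 + no restarts => run the command exactly once
service = client.services.create(
    "alpine:latest",
    args=["echo", "hello"],
    name="sch_example_oneshot",
    restart_policy=RestartPolicy(condition="none"),
    mode=ServiceMode("replicated-job", concurrency=1, replicas=1),
)

# Swarm starts the task asynchronously; poll for the container it creates,
# giving up after ~12s as the generated _create_service does.
for _ in range(12):
    containers = client.containers.list(
        all=True,
        filters={"label": f"com.docker.swarm.service.name={service.name}"},
    )
    if containers:
        break
    time.sleep(1)
else:
    raise RuntimeError(f"Container for service {service.name} not starting")

exit_status = containers[0].wait()["StatusCode"]
print(f"exit status: {exit_status}")
service.remove()
```

Polling for the container (rather than inspecting the service's task list) is what lets the generated code reuse the ordinary `container.logs(...)` and `container.wait()` APIs afterwards.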
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_autotest20.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_autotest20.py
deleted file mode 100644
index 52b05a70..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_autotest20.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils.util  # bare "import distutils" does not reliably expose distutils.util
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
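-# Read bytes from either a URL or a local file path: urlopen() raises
-# ValueError for strings that are not URLs, so we fall back to open().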
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
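- # Upload a log payload to MinIO. The Minio client wants host[:port];
- # on the standard ports (80 for http, 443 for https) the port is omitted.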
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
- # a future revision of EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
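- # docker-py does not expose a clean way to set extra HTTP headers, so the
- # Portainer X-API-Key is injected into the underlying APIClient by hand.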
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
- ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
- # docker.py: for a replicated job, total completions = replicas
- # with replicas=0 you do not get a container
- service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
- mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
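- # Poll for up to ~12s until swarm schedules a container for the service;
- # a replicated-job with replicas=1 should yield exactly one container.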
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise Exception(f"Container for service {name} not starting")
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
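- # mode picks the image and arguments: "gleaner" harvests a source; the
- # other modes run nabu (prune/prov/orgs/release) against an object prefix.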
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
- cid = container.id # legacy until the service start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
- ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
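- # fetch {WorkingDir}/logs from the stopped container as a tar stream via
- # the Docker Engine /containers/{id}/archive endpoint (through Portainer).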
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
- # Future: need to extract files and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def autotest20_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def autotest20_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "autotest20")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def autotest20_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "autotest20")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def autotest20_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "autotest20")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def autotest20_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "autotest20")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def autotest20_naburelease(context):
- returned_value = gleanerio(context,("release"), "autotest20")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def autotest20_uploadrelease(context):
- returned_value = postRelease("autotest20")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def autotest20_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="autotest20")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "autotest20"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def autotest20_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="autotest20")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "autotest20"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # check the graph (milled) only, not summoned objects
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def autotest20_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="autotest20")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "autotest20"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def autotest20_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="autotest20")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "autotest20"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifier stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def autotest20_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "autotest20"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucket urls report returned {r} ")
- return
-
-
-#Can we simplify and use just a method. Then import these methods?
-# def missingreport_s3(context, msg: str, source="autotest20"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="autotest20"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_autotest20():
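- # ops are chained via Nothing deps: pull images, harvest, then the report
- # chain and the release/prune/prov/orgs loads, with graph reports last.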
- containers = autotest20_getImage()
- harvest = autotest20_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = autotest20_missingreport_s3(start=harvest)
- report_idstat = autotest20_identifier_stats(start=report_ms3)
- # for some reason, this causes a msg parameter missing
- report_bucketurl = autotest20_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="autotest20")
- load_release = autotest20_naburelease(start=harvest)
- load_uploadrelease = autotest20_uploadrelease(start=load_release)
-
- load_prune = autotest20_nabu_prune(start=load_uploadrelease)
- load_prov = autotest20_nabuprov(start=load_prune)
- load_org = autotest20_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=autotest20_missingreport_graph(start=load_org)
- report_graph=autotest20_graph_reports(start=report_msgraph)
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cagagespids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cagagespids0.py
deleted file mode 100644
index e8b52d7f..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cagagespids0.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils.util
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
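-# Build the SPARQL endpoint URL for the configured namespace (Blazegraph-style layout).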
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
- # a future revision of EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
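- # POST to the SPARQL endpoint with ?uri=<release nquads> so the graph
- # store loads the file server-side; Blazegraph answers with a
- # 'data modified="N"' body, and N=0 means nothing was inserted.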
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
- ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
- # docker.py: for a replicated job, total completions = replicas
- # with replicas=0 you do not get a container
- service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
- mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise Exception(f"Container for service {name} not starting")
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
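- # env vars handed to the service; names must match the BindEnv
- # bindings listed above so gleaner/nabu pick them up via viper.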
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
- cid = container.id # legacy until the service start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
- ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
- # Future: need to extract files and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
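- # normal runs remove the one-shot service here; with DEBUG set the
- # service (and its container logs) is left in place for inspection.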
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def cagagespids0_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def cagagespids0_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "cagagespids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cagagespids0_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "cagagespids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cagagespids0_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "cagagespids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cagagespids0_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "cagagespids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cagagespids0_naburelease(context):
- returned_value = gleanerio(context,("release"), "cagagespids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cagagespids0_uploadrelease(context):
- returned_value = postRelease("cagagespids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def cagagespids0_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cagagespids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cagagespids0"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cagagespids0_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cagagespids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cagagespids0"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # check the graph (milled) only, not summoned objects
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cagagespids0_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cagagespids0")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cagagespids0"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cagagespids0_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cagagespids0")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cagagespids0"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifier stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cagagespids0_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cagagespids0"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucket urls report returned {r} ")
- return
-
-
-#Can we simplify and use just a method. Then import these methods?
-# def missingreport_s3(context, msg: str, source="cagagespids0"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="cagagespids0"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_cagagespids0():
- containers = cagagespids0_getImage()
- harvest = cagagespids0_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = cagagespids0_missingreport_s3(start=harvest)
- report_idstat = cagagespids0_identifier_stats(start=report_ms3)
- # for some reason, this causes a msg parameter missing
- report_bucketurl = cagagespids0_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="cagagespids0")
- load_release = cagagespids0_naburelease(start=harvest)
- load_uploadrelease = cagagespids0_uploadrelease(start=load_release)
-
- load_prune = cagagespids0_nabu_prune(start=load_uploadrelease)
- load_prov = cagagespids0_nabuprov(start=load_prune)
- load_org = cagagespids0_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=cagagespids0_missingreport_graph(start=load_org)
- report_graph=cagagespids0_graph_reports(start=report_msgraph)
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cbsa0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cbsa0.py
deleted file mode 100644
index fd71d2dc..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cbsa0.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils.util
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
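-# A quick illustration (paths hypothetical): load_data("https://example.org/cfg.yaml")
-# fetches over HTTP, while load_data("/gleaner/gleanerconfig.yaml") falls back to the
-# local file read, because urlopen raises ValueError for a path without a URL scheme.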
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
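-        # e.g. (illustrative values) "minio.example.org" + port "443" + SSL
-        # -> "minio.example.org", but "minio.example.org" + port "9000"
-        # -> "minio.example.org:9000"; the Minio client expects host[:port].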
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
-    # revision of EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
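-# Rough HTTP equivalent of postRelease, with hypothetical values (the Blazegraph
-# REST API loads a remote RDF file passed in the `uri` query parameter):
-#   POST {graph}/namespace/{ns}/sparql?uri=https://minio.example.org:443/bucket/graphs/latest/src_release.nq
-# A 200 response whose body contains 'data modified="0"' means the file loaded
-# but added no triples, which postRelease treats as a failure.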
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
-    ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
-    # docker.py: for a replicated job, total completions = replicas
-    # with replicas=0 you do not get a container
-    service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
-        mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
-            raise Exception(f"Container for service {name} not starting")
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
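-# Usage sketch (assumes a reachable swarm manager and docker configs named by
-# GLEANERIO_*_DOCKER_CONFIG; the source name is illustrative):
-#   service, container = _create_service(
-#       context, client, container_context, GLEANERIO_GLEANER_IMAGE, None,
-#       ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH, "-source", "example", "--rude"],
-#       name="sch_example_gleaner", workingdir="/gleaner/")
-# "replicated-job" mode with replicas=1 runs the task once to completion; the
-# polling loop above gives swarm roughly 12 seconds to start the container.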
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
-        # ARGS = f"gleaner --cfg /gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
-        cid = container.id # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
-        ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
-        # Future: need to extract files and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
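-# Summary of mode -> container args, as assembled by the branches above:
-#   gleaner: --cfg <gleaner cfg> -source <source> --rude
-#   prune:   --cfg <nabu cfg> prune   --prefix summoned/<source>
-#   prov:    --cfg <nabu cfg> prefix  --prefix prov/<source>
-#   orgs:    --cfg <nabu cfg> prefix  --prefix orgs
-#   release: --cfg <nabu cfg> release --prefix summoned/<source>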
-
-@op
-def cbsa0_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def cbsa0_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "cbsa0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cbsa0_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "cbsa0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cbsa0_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "cbsa0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cbsa0_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "cbsa0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cbsa0_naburelease(context):
- returned_value = gleanerio(context,("release"), "cbsa0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cbsa0_uploadrelease(context):
- returned_value = postRelease("cbsa0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def cbsa0_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cbsa0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cbsa0"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-    r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cbsa0_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cbsa0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cbsa0"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cbsa0_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cbsa0")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cbsa0"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cbsa0_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cbsa0")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cbsa0"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cbsa0_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cbsa0"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
-# Can we simplify and use just a method, then import these methods?
-# def missingreport_s3(context, msg: str, source="cbsa0"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="cbsa0"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_cbsa0():
- containers = cbsa0_getImage()
- harvest = cbsa0_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = cbsa0_missingreport_s3(start=harvest)
- report_idstat = cbsa0_identifier_stats(start=report_ms3)
-    # for some reason, this caused a "msg parameter missing" error
- report_bucketurl = cbsa0_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="cbsa0")
- load_release = cbsa0_naburelease(start=harvest)
- load_uploadrelease = cbsa0_uploadrelease(start=load_release)
-
- load_prune = cbsa0_nabu_prune(start=load_uploadrelease)
- load_prov = cbsa0_nabuprov(start=load_prune)
- load_org = cbsa0_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=cbsa0_missingreport_graph(start=load_org)
- report_graph=cbsa0_graph_reports(start=report_msgraph)
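-# Sketch of how this graph could be wired into a runnable job and schedule
-# (names are hypothetical; the generated jobs/schedules presumably live in
-# sibling modules):
-#   harvest_cbsa0_job = harvest_cbsa0.to_job(name="harvest_cbsa0_job")
-#   schedule = ScheduleDefinition(job=harvest_cbsa0_job, cron_schedule="0 6 * * *")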
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_chyldpilotids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_chyldpilotids0.py
deleted file mode 100644
index b22c1575..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_chyldpilotids0.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils.util
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in Docker, not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
-    # revision of EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
-    ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
-    # docker.py: for a replicated job, total completions = replicas
-    # with replicas=0 you do not get a container
-    service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
-        mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
-            raise Exception(f"Container for service {name} not starting")
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
-        # ARGS = f"gleaner --cfg /gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
-        cid = container.id # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
-        ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
-        # Future: need to extract files and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def chyldpilotids0_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def chyldpilotids0_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "chyldpilotids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def chyldpilotids0_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "chyldpilotids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def chyldpilotids0_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "chyldpilotids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def chyldpilotids0_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "chyldpilotids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def chyldpilotids0_naburelease(context):
- returned_value = gleanerio(context,("release"), "chyldpilotids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def chyldpilotids0_uploadrelease(context):
- returned_value = postRelease("chyldpilotids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def chyldpilotids0_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="chyldpilotids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "chyldpilotids0"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-    r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def chyldpilotids0_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="chyldpilotids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "chyldpilotids0"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def chyldpilotids0_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="chyldpilotids0")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "chyldpilotids0"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def chyldpilotids0_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="chyldpilotids0")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "chyldpilotids0"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def chyldpilotids0_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "chyldpilotids0"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
-# Can we simplify and use just a method, then import these methods?
-# def missingreport_s3(context, msg: str, source="chyldpilotids0"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="chyldpilotids0"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_chyldpilotids0():
- containers = chyldpilotids0_getImage()
- harvest = chyldpilotids0_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = chyldpilotids0_missingreport_s3(start=harvest)
- report_idstat = chyldpilotids0_identifier_stats(start=report_ms3)
-    # for some reason, this caused a "msg parameter missing" error
- report_bucketurl = chyldpilotids0_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="chyldpilotids0")
- load_release = chyldpilotids0_naburelease(start=harvest)
- load_uploadrelease = chyldpilotids0_uploadrelease(start=load_release)
-
- load_prune = chyldpilotids0_nabu_prune(start=load_uploadrelease)
- load_prov = chyldpilotids0_nabuprov(start=load_prune)
- load_org = chyldpilotids0_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=chyldpilotids0_missingreport_graph(start=load_org)
- report_graph=chyldpilotids0_graph_reports(start=report_msgraph)
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_counties0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_counties0.py
deleted file mode 100644
index af125a85..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_counties0.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils.util
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in Docker, not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
-    # revision of EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
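# Editorial sketch of the request postRelease() issues, with hypothetical
# endpoint and release URLs (Blazegraph loads the N-Quads file named by the
# ?uri= query parameter):
import requests

sparql = "http://graph.example.org/blazegraph/namespace/demo/sparql"             # hypothetical
release = "http://minio.example.org:9000/gleaner/graphs/latest/demo_release.nq"  # hypothetical
resp = requests.post(f"{sparql}?uri={release}")
if resp.status_code != 200 or 'data modified="0"' in resp.text:
    raise Exception(f"release load failed: {resp.status_code} {resp.text}")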
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
-    ## thoughts
-    # return service, container, since there is one
-    restart_policy = RestartPolicy(condition='none')
-    # docker.py: for a replicated job, total completions = replicas
-    # with replicas=0 you do not get a container
-    service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
-        mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise f"Container for service {name} not starting"
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
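# Editorial sketch: the wait loop above, factored into a reusable helper (not
# the project's API) that polls until the swarm service has a container, or
# gives up after a timeout.
import time

def wait_for_service_container(client, name, timeout_s=12, poll_s=1):
    waited = 0
    while waited < timeout_s:
        time.sleep(poll_s)
        waited += poll_s
        containers = client.containers.list(
            all=True, filters={"label": f"com.docker.swarm.service.name={name}"})
        if containers:
            return containers[0]
    raise TimeoutError(f"Container for service {name} not starting")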
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
-        cid = container.id # legacy till the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
- ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
-        # Future: need to extract files and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
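# Editorial sketch: the if/elif mode dispatch at the top of gleanerio() could
# be tabulated. Nabu modes only (the gleaner mode uses a different image and
# flags); the config path default mirrors GLEANERIO_NABU_CONFIG_PATH above.
NABU_MODES = {
    "prune":   lambda src: ["prune", "--prefix", "summoned/" + src],
    "prov":    lambda src: ["prefix", "--prefix", "prov/" + src],
    "orgs":    lambda src: ["prefix", "--prefix", "orgs"],
    "release": lambda src: ["release", "--prefix", "summoned/" + src],
}

def nabu_args(mode, source, cfg="/nabu/nabuconfig.yaml"):
    return ["--cfg", cfg] + NABU_MODES[mode](source)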
-
-@op
-def counties0_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def counties0_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "counties0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def counties0_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "counties0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def counties0_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "counties0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def counties0_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "counties0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def counties0_naburelease(context):
- returned_value = gleanerio(context,("release"), "counties0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def counties0_uploadrelease(context):
- returned_value = postRelease("counties0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def counties0_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="counties0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "counties0"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-    r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def counties0_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="counties0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "counties0"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def counties0_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="counties0")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "counties0"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def counties0_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="counties0")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "counties0"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def counties0_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "counties0"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
-# Can we simplify and use just a method, then import these methods?
-# def missingreport_s3(context, msg: str, source="counties0"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="counties0"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_counties0():
- containers = counties0_getImage()
- harvest = counties0_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = counties0_missingreport_s3(start=harvest)
- report_idstat = counties0_identifier_stats(start=report_ms3)
-    # for some reason, this causes a "msg parameter missing" error
- report_bucketurl = counties0_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="counties0")
- load_release = counties0_naburelease(start=harvest)
- load_uploadrelease = counties0_uploadrelease(start=load_release)
-
- load_prune = counties0_nabu_prune(start=load_uploadrelease)
- load_prov = counties0_nabuprov(start=load_prune)
- load_org = counties0_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=counties0_missingreport_graph(start=load_org)
- report_graph=counties0_graph_reports(start=report_msgraph)
-
-
-
-
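
A graph like harvest_counties0 still has to be bound to a job, and usually a schedule, before it can run; a minimal sketch of that wiring (job name and cron below are hypothetical, not taken from this diff):

    from dagster import ScheduleDefinition

    harvest_counties0_job = harvest_counties0.to_job(name="harvest_counties0_job")
    harvest_counties0_schedule = ScheduleDefinition(
        job=harvest_counties0_job,
        cron_schedule="0 6 * * 0",  # hypothetical weekly run
    )
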
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisandrewsforestlterids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisandrewsforestlterids0.py
deleted file mode 100644
index c64c8a7a..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisandrewsforestlterids0.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, and not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
-    # revision of EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
-    ## thoughts
-    # return service, container, since there is one
-    restart_policy = RestartPolicy(condition='none')
-    # docker.py: for a replicated job, total completions = replicas
-    # with replicas=0 you do not get a container
-    service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
-        mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise f"Container for service {name} not starting"
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
-        cid = container.id # legacy till the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
-        ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
-        # Future: need to extract files and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def cuahsihisandrewsforestlterids0_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def cuahsihisandrewsforestlterids0_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "cuahsihisandrewsforestlterids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisandrewsforestlterids0_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "cuahsihisandrewsforestlterids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisandrewsforestlterids0_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "cuahsihisandrewsforestlterids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisandrewsforestlterids0_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "cuahsihisandrewsforestlterids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisandrewsforestlterids0_naburelease(context):
- returned_value = gleanerio(context,("release"), "cuahsihisandrewsforestlterids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisandrewsforestlterids0_uploadrelease(context):
- returned_value = postRelease("cuahsihisandrewsforestlterids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisandrewsforestlterids0_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisandrewsforestlterids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisandrewsforestlterids0"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-    r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisandrewsforestlterids0_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisandrewsforestlterids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisandrewsforestlterids0"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisandrewsforestlterids0_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisandrewsforestlterids0")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisandrewsforestlterids0"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisandrewsforestlterids0_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisandrewsforestlterids0")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisandrewsforestlterids0"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisandrewsforestlterids0_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisandrewsforestlterids0"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
-# Can we simplify and use just a method, then import these methods?
-# def missingreport_s3(context, msg: str, source="cuahsihisandrewsforestlterids0"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="cuahsihisandrewsforestlterids0"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_cuahsihisandrewsforestlterids0():
- containers = cuahsihisandrewsforestlterids0_getImage()
- harvest = cuahsihisandrewsforestlterids0_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = cuahsihisandrewsforestlterids0_missingreport_s3(start=harvest)
- report_idstat = cuahsihisandrewsforestlterids0_identifier_stats(start=report_ms3)
-    # for some reason, this causes a "msg parameter missing" error
- report_bucketurl = cuahsihisandrewsforestlterids0_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="cuahsihisandrewsforestlterids0")
- load_release = cuahsihisandrewsforestlterids0_naburelease(start=harvest)
- load_uploadrelease = cuahsihisandrewsforestlterids0_uploadrelease(start=load_release)
-
- load_prune = cuahsihisandrewsforestlterids0_nabu_prune(start=load_uploadrelease)
- load_prov = cuahsihisandrewsforestlterids0_nabuprov(start=load_prune)
- load_org = cuahsihisandrewsforestlterids0_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=cuahsihisandrewsforestlterids0_missingreport_graph(start=load_org)
- report_graph=cuahsihisandrewsforestlterids0_graph_reports(start=report_msgraph)
-
-
-
-
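
What _get_client() in these files works around: docker-py has no per-request header option, so the X-API-Key header for a Portainer-style Docker endpoint is injected through the low-level client, as the deleted code does. A standalone sketch with hypothetical URL and key:

    import docker

    url = "https://portainer.example.org:9443/api/endpoints/1/docker"  # hypothetical
    client = docker.DockerClient(base_url=url, version="1.43")
    client.api.headers["X-API-Key"] = "ptr_example_key"  # hypothetical
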
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisbrazilucbids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisbrazilucbids0.py
deleted file mode 100644
index 29a7b92e..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisbrazilucbids0.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, and not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
-    # revision of EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
-    ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
-    # docker.py: for a replicated-job, total completions = replicas;
-    # with replicas=0 you do not get a container
-    service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
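-    # net effect: the task runs exactly once, like a one-shot job, since the
-    # restart condition 'none' plus a single replica means no retries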
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
-        mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
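-    # poll roughly once a second for the swarm task to spawn a container we
-    # can attach to, and give up after ~12 tries rather than hang forever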
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise f"Container for service {name} not starting"
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
-        # add env variables here, e.g. "Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
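-        # (note: _graphEndpoint() already composes
-        # {GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql, and that
-        # is what gets exported as SPARQL_ENDPOINT below)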
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
-        cid = container.id # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
-        ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
-        # Future: need to extract files and upload them
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def cuahsihisbrazilucbids0_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def cuahsihisbrazilucbids0_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "cuahsihisbrazilucbids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisbrazilucbids0_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "cuahsihisbrazilucbids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisbrazilucbids0_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "cuahsihisbrazilucbids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisbrazilucbids0_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "cuahsihisbrazilucbids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisbrazilucbids0_naburelease(context):
- returned_value = gleanerio(context,("release"), "cuahsihisbrazilucbids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisbrazilucbids0_uploadrelease(context):
- returned_value = postRelease("cuahsihisbrazilucbids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisbrazilucbids0_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisbrazilucbids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisbrazilucbids0"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-    r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisbrazilucbids0_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisbrazilucbids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisbrazilucbids0"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisbrazilucbids0_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisbrazilucbids0")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisbrazilucbids0"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisbrazilucbids0_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisbrazilucbids0")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisbrazilucbids0"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisbrazilucbids0_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisbrazilucbids0"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
-# Can we simplify and use just a method, then import these methods?
-# def missingreport_s3(context, msg: str, source="cuahsihisbrazilucbids0"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="cuahsihisbrazilucbids0"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_cuahsihisbrazilucbids0():
- containers = cuahsihisbrazilucbids0_getImage()
- harvest = cuahsihisbrazilucbids0_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
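-    # each op below declares ins={"start": In(Nothing)}, so passing an upstream
-    # result as start= only orders execution; no data flows between the ops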
-
- report_ms3 = cuahsihisbrazilucbids0_missingreport_s3(start=harvest)
- report_idstat = cuahsihisbrazilucbids0_identifier_stats(start=report_ms3)
-    # for some reason, this causes a "msg parameter missing" error
- report_bucketurl = cuahsihisbrazilucbids0_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="cuahsihisbrazilucbids0")
- load_release = cuahsihisbrazilucbids0_naburelease(start=harvest)
- load_uploadrelease = cuahsihisbrazilucbids0_uploadrelease(start=load_release)
-
- load_prune = cuahsihisbrazilucbids0_nabu_prune(start=load_uploadrelease)
- load_prov = cuahsihisbrazilucbids0_nabuprov(start=load_prune)
- load_org = cuahsihisbrazilucbids0_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=cuahsihisbrazilucbids0_missingreport_graph(start=load_org)
- report_graph=cuahsihisbrazilucbids0_graph_reports(start=report_msgraph)
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihiscalvinhhsids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihiscalvinhhsids0.py
deleted file mode 100644
index 338a9356..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihiscalvinhhsids0.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils
-import time
-
-from dagster import job, op, graph, In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, and not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
-    # revision of EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
-    ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
-    # docker.py: for a replicated-job, total completions = replicas;
-    # with replicas=0 you do not get a container
-    service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
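-    # net effect: the task runs exactly once, like a one-shot job, since the
-    # restart condition 'none' plus a single replica means no retries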
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
-        mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
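-    # poll roughly once a second for the swarm task to spawn a container we
-    # can attach to, and give up after ~12 tries rather than hang forever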
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise f"Container for service {name} not starting"
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
-        # add env variables here, e.g. "Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
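-        # (note: _graphEndpoint() already composes
-        # {GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql, and that
-        # is what gets exported as SPARQL_ENDPOINT below)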
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
-        cid = container.id # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
-        ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
-        # Future: need to extract files and upload them
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def cuahsihiscalvinhhsids0_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def cuahsihiscalvinhhsids0_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "cuahsihiscalvinhhsids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihiscalvinhhsids0_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "cuahsihiscalvinhhsids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihiscalvinhhsids0_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "cuahsihiscalvinhhsids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihiscalvinhhsids0_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "cuahsihiscalvinhhsids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihiscalvinhhsids0_naburelease(context):
- returned_value = gleanerio(context,("release"), "cuahsihiscalvinhhsids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihiscalvinhhsids0_uploadrelease(context):
- returned_value = postRelease("cuahsihiscalvinhhsids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def cuahsihiscalvinhhsids0_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihiscalvinhhsids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihiscalvinhhsids0"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-    r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihiscalvinhhsids0_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihiscalvinhhsids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihiscalvinhhsids0"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihiscalvinhhsids0_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihiscalvinhhsids0")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihiscalvinhhsids0"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihiscalvinhhsids0_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihiscalvinhhsids0")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihiscalvinhhsids0"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihiscalvinhhsids0_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihiscalvinhhsids0"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
-# Can we simplify and use just a method, then import these methods?
-# def missingreport_s3(context, msg: str, source="cuahsihiscalvinhhsids0"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="cuahsihiscalvinhhsids0"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_cuahsihiscalvinhhsids0():
- containers = cuahsihiscalvinhhsids0_getImage()
- harvest = cuahsihiscalvinhhsids0_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
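-    # each op below declares ins={"start": In(Nothing)}, so passing an upstream
-    # result as start= only orders execution; no data flows between the ops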
-
- report_ms3 = cuahsihiscalvinhhsids0_missingreport_s3(start=harvest)
- report_idstat = cuahsihiscalvinhhsids0_identifier_stats(start=report_ms3)
-    # for some reason, this causes a "msg parameter missing" error
- report_bucketurl = cuahsihiscalvinhhsids0_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="cuahsihiscalvinhhsids0")
- load_release = cuahsihiscalvinhhsids0_naburelease(start=harvest)
- load_uploadrelease = cuahsihiscalvinhhsids0_uploadrelease(start=load_release)
-
- load_prune = cuahsihiscalvinhhsids0_nabu_prune(start=load_uploadrelease)
- load_prov = cuahsihiscalvinhhsids0_nabuprov(start=load_prune)
- load_org = cuahsihiscalvinhhsids0_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=cuahsihiscalvinhhsids0_missingreport_graph(start=load_org)
- report_graph=cuahsihiscalvinhhsids0_graph_reports(start=report_msgraph)
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisccbepdapids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisccbepdapids0.py
deleted file mode 100644
index 97fbc12b..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisccbepdapids0.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, and not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
- # revision of EC utilities, will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
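- # docker-py exposes no public option for extra HTTP headers, so the API key
- # header is injected into the underlying APIClient; this lets the client go
- # through a Portainer-fronted Docker endpoint.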
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
- ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
- # docker.py if replicated job, total completions = replicas
- # replicas =0 you do not get a container
- service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
- mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
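- # swarm schedules the replicated-job asynchronously, so poll (up to ~12s) for
- # a container carrying the service label before attaching to its logs.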
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise f"Container for service {name} not starting"
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
- cid = container.id # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
- ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
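- # latin-1 maps every byte to a character, so the decode never raises even if
- # the captured logs contain binary noise.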
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
- # Future: need to extract files, and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def cuahsihisccbepdapids0_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def cuahsihisccbepdapids0_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "cuahsihisccbepdapids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisccbepdapids0_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "cuahsihisccbepdapids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisccbepdapids0_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "cuahsihisccbepdapids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisccbepdapids0_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "cuahsihisccbepdapids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisccbepdapids0_naburelease(context):
- returned_value = gleanerio(context,("release"), "cuahsihisccbepdapids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisccbepdapids0_uploadrelease(context):
- returned_value = postRelease("cuahsihisccbepdapids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisccbepdapids0_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisccbepdapids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisccbepdapids0"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisccbepdapids0_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisccbepdapids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisccbepdapids0"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisccbepdapids0_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisccbepdapids0")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisccbepdapids0"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisccbepdapids0_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisccbepdapids0")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisccbepdapids0"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisccbepdapids0_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisccbepdapids0"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
-#Can we simplify and use just a method. Then import these methods?
-# def missingreport_s3(context, msg: str, source="cuahsihisccbepdapids0"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="cuahsihisccbepdapids0"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
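-# One possible answer, sketched here only: a factory that builds the per-source
-# ops, so each generated file shrinks to a few calls. The names below
-# (make_gleaner_op and the sample assignment) are hypothetical, not part of
-# this repo; the sketch assumes the module-level gleanerio() defined above.
-#
-# def make_gleaner_op(source_name):
-#     @op(name=f"{source_name}_gleaner", ins={"start": In(Nothing)})
-#     def _gleaner(context):
-#         gleanerio(context, "gleaner", source_name)
-#     return _gleaner
-#
-# gleaner_op = make_gleaner_op("cuahsihisccbepdapids0")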
-@graph
-def harvest_cuahsihisccbepdapids0():
- containers = cuahsihisccbepdapids0_getImage()
- harvest = cuahsihisccbepdapids0_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = cuahsihisccbepdapids0_missingreport_s3(start=harvest)
- report_idstat = cuahsihisccbepdapids0_identifier_stats(start=report_ms3)
- # for some reason, this causes a missing msg parameter error
- report_bucketurl = cuahsihisccbepdapids0_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="cuahsihisccbepdapids0")
- load_release = cuahsihisccbepdapids0_naburelease(start=harvest)
- load_uploadrelease = cuahsihisccbepdapids0_uploadrelease(start=load_release)
-
- load_prune = cuahsihisccbepdapids0_nabu_prune(start=load_uploadrelease)
- load_prov = cuahsihisccbepdapids0_nabuprov(start=load_prune)
- load_org = cuahsihisccbepdapids0_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=cuahsihisccbepdapids0_missingreport_graph(start=load_org)
- report_graph=cuahsihisccbepdapids0_graph_reports(start=report_msgraph)
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihiscedarriverids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihiscedarriverids0.py
deleted file mode 100644
index 1fc61da6..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihiscedarriverids0.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, and not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
- # revision of EC utilities, will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
- ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
- # docker.py if replicated job, total completions = replicas
- # replicas =0 you do not get a container
- service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
- mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise f"Container for service {name} not starting"
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
- cid = container.id # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
- ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
- # Future: need to extract files, and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def cuahsihiscedarriverids0_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def cuahsihiscedarriverids0_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "cuahsihiscedarriverids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihiscedarriverids0_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "cuahsihiscedarriverids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihiscedarriverids0_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "cuahsihiscedarriverids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihiscedarriverids0_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "cuahsihiscedarriverids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihiscedarriverids0_naburelease(context):
- returned_value = gleanerio(context,("release"), "cuahsihiscedarriverids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihiscedarriverids0_uploadrelease(context):
- returned_value = postRelease("cuahsihiscedarriverids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def cuahsihiscedarriverids0_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihiscedarriverids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihiscedarriverids0"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihiscedarriverids0_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihiscedarriverids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihiscedarriverids0"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihiscedarriverids0_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihiscedarriverids0")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihiscedarriverids0"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihiscedarriverids0_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihiscedarriverids0")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihiscedarriverids0"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihiscedarriverids0_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihiscedarriverids0"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
-# Can we simplify and just use a method, then import these methods?
-# def missingreport_s3(context, msg: str, source="cuahsihiscedarriverids0"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="cuahsihiscedarriverids0"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_cuahsihiscedarriverids0():
- containers = cuahsihiscedarriverids0_getImage()
- harvest = cuahsihiscedarriverids0_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
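- # the pattern, with hypothetical op names:
- #   @op(ins={"start": In(Nothing)})
- #   def downstream_op(context): ...
- # then the graph orders ops without passing data:
- #   downstream_op(start=upstream_op())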
-
- report_ms3 = cuahsihiscedarriverids0_missingreport_s3(start=harvest)
- report_idstat = cuahsihiscedarriverids0_identifier_stats(start=report_ms3)
- # for some reason, this causes a missing msg parameter error
- report_bucketurl = cuahsihiscedarriverids0_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="cuahsihiscedarriverids0")
- load_release = cuahsihiscedarriverids0_naburelease(start=harvest)
- load_uploadrelease = cuahsihiscedarriverids0_uploadrelease(start=load_release)
-
- load_prune = cuahsihiscedarriverids0_nabu_prune(start=load_uploadrelease)
- load_prov = cuahsihiscedarriverids0_nabuprov(start=load_prune)
- load_org = cuahsihiscedarriverids0_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=cuahsihiscedarriverids0_missingreport_graph(start=load_org)
- report_graph=cuahsihiscedarriverids0_graph_reports(start=report_msgraph)
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisclarksburgspids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisclarksburgspids0.py
deleted file mode 100644
index 4460f887..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisclarksburgspids0.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
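- # writes `data` (bytes) under the log prefix as <name>_<timestamp>.log;
- # e.g. with the default GLEANERIO_LOG_PREFIX and a hypothetical name,
- #   s3loader(b"run output", "sch_example_gleaner")
- # lands at scheduler/logs/sch_example_gleaner_2024_01_01_00_00_00.log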
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
- # revision of EC utilities, will have a insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
- ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
- # docker.py: for a replicated job, total completions = replicas;
- # with replicas=0 you do not get a container
- service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
- mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
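- # a replicated-job service runs the task once to completion; poll up to
- # ~12s for Swarm to start the task's container so its logs can be followed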
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise f"Container for service {name} not starting"
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
- cid = container.id # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
- ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
- # Future: need to extract files and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def cuahsihisclarksburgspids0_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def cuahsihisclarksburgspids0_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "cuahsihisclarksburgspids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisclarksburgspids0_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "cuahsihisclarksburgspids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisclarksburgspids0_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "cuahsihisclarksburgspids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisclarksburgspids0_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "cuahsihisclarksburgspids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisclarksburgspids0_naburelease(context):
- returned_value = gleanerio(context,("release"), "cuahsihisclarksburgspids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisclarksburgspids0_uploadrelease(context):
- returned_value = postRelease("cuahsihisclarksburgspids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisclarksburgspids0_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisclarksburgspids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisclarksburgspids0"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisclarksburgspids0_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisclarksburgspids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisclarksburgspids0"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisclarksburgspids0_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisclarksburgspids0")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisclarksburgspids0"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisclarksburgspids0_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisclarksburgspids0")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisclarksburgspids0"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisclarksburgspids0_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisclarksburgspids0"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
-# Can we simplify and just use a method, then import these methods?
-# def missingreport_s3(context, msg: str, source="cuahsihisclarksburgspids0"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="cuahsihisclarksburgspids0"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_cuahsihisclarksburgspids0():
- containers = cuahsihisclarksburgspids0_getImage()
- harvest = cuahsihisclarksburgspids0_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = cuahsihisclarksburgspids0_missingreport_s3(start=harvest)
- report_idstat = cuahsihisclarksburgspids0_identifier_stats(start=report_ms3)
- # for some reason, this causes a missing msg parameter error
- report_bucketurl = cuahsihisclarksburgspids0_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="cuahsihisclarksburgspids0")
- load_release = cuahsihisclarksburgspids0_naburelease(start=harvest)
- load_uploadrelease = cuahsihisclarksburgspids0_uploadrelease(start=load_release)
-
- load_prune = cuahsihisclarksburgspids0_nabu_prune(start=load_uploadrelease)
- load_prov = cuahsihisclarksburgspids0_nabuprov(start=load_prune)
- load_org = cuahsihisclarksburgspids0_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=cuahsihisclarksburgspids0_missingreport_graph(start=load_org)
- report_graph=cuahsihisclarksburgspids0_graph_reports(start=report_msgraph)
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihiscocorahsids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihiscocorahsids0.py
deleted file mode 100644
index de371636..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihiscocorahsids0.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
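- # accepts either a URL or a local path: urlopen raises ValueError for a
- # non-URL string, so this falls back to reading a local file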
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
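- # omit the port when it matches the scheme default (80 for plain HTTP,
- # 443 for TLS); otherwise the Minio client needs an explicit host:port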
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
- # revision of EC utilities, will have a insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
- ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
- # docker.py: for a replicated job, total completions = replicas;
- # with replicas=0 you do not get a container
- service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
- mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise f"Container for service {name} not starting"
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
- cid = container.id # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
- ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
- # Future: need to extract files and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def cuahsihiscocorahsids0_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def cuahsihiscocorahsids0_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "cuahsihiscocorahsids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihiscocorahsids0_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "cuahsihiscocorahsids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihiscocorahsids0_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "cuahsihiscocorahsids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihiscocorahsids0_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "cuahsihiscocorahsids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihiscocorahsids0_naburelease(context):
- returned_value = gleanerio(context,("release"), "cuahsihiscocorahsids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihiscocorahsids0_uploadrelease(context):
- returned_value = postRelease("cuahsihiscocorahsids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def cuahsihiscocorahsids0_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihiscocorahsids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihiscocorahsids0"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihiscocorahsids0_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihiscocorahsids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihiscocorahsids0"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihiscocorahsids0_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihiscocorahsids0")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihiscocorahsids0"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihiscocorahsids0_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihiscocorahsids0")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihiscocorahsids0"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihiscocorahsids0_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihiscocorahsids0"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
-#Can we simplify and use just a method. Then import these methods?
-# def missingreport_s3(context, msg: str, source="cuahsihiscocorahsids0"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="cuahsihiscocorahsids0"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_cuahsihiscocorahsids0():
- containers = cuahsihiscocorahsids0_getImage()
- harvest = cuahsihiscocorahsids0_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = cuahsihiscocorahsids0_missingreport_s3(start=harvest)
- report_idstat = cuahsihiscocorahsids0_identifier_stats(start=report_ms3)
- # for some reason, this causes a "msg parameter missing" error
- report_bucketurl = cuahsihiscocorahsids0_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="cuahsihiscocorahsids0")
- load_release = cuahsihiscocorahsids0_naburelease(start=harvest)
- load_uploadrelease = cuahsihiscocorahsids0_uploadrelease(start=load_release)
-
- load_prune = cuahsihiscocorahsids0_nabu_prune(start=load_uploadrelease)
- load_prov = cuahsihiscocorahsids0_nabuprov(start=load_prune)
- load_org = cuahsihiscocorahsids0_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=cuahsihiscocorahsids0_missingreport_graph(start=load_org)
- report_graph=cuahsihiscocorahsids0_graph_reports(start=report_msgraph)
-
-
-
-
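The generated `harvest_*` graphs above chain ops purely for ordering via `start=` arguments, which is Dagster's Nothing-dependency pattern (see the docs link embedded in each file). A minimal, self-contained sketch of the pattern, with illustrative op names:

```python
from dagster import In, Nothing, get_dagster_logger, graph, op

@op
def harvest_op():
    get_dagster_logger().info("harvest finished")

# A Nothing input carries no data, so it does not appear in the function
# signature; it only forces this op to run after its upstream op.
@op(ins={"start": In(Nothing)})
def report_op():
    get_dagster_logger().info("report runs only after harvest")

@graph
def harvest_then_report():
    report_op(start=harvest_op())

harvest_job = harvest_then_report.to_job()
```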
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihiscrwaids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihiscrwaids0.py
deleted file mode 100644
index c99b9036..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihiscrwaids0.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils.util
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
- # revision of EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
- ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
- # docker.py if replicated job, total completions = replicas
- # replicas =0 you do not get a container
- service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
- mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise f"Container for service {name} not starting"
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
- cid = container.id # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
- ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
- # Future: need to extract files and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def cuahsihiscrwaids0_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def cuahsihiscrwaids0_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "cuahsihiscrwaids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihiscrwaids0_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "cuahsihiscrwaids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihiscrwaids0_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "cuahsihiscrwaids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihiscrwaids0_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "cuahsihiscrwaids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihiscrwaids0_naburelease(context):
- returned_value = gleanerio(context,("release"), "cuahsihiscrwaids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihiscrwaids0_uploadrelease(context):
- returned_value = postRelease("cuahsihiscrwaids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def cuahsihiscrwaids0_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihiscrwaids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihiscrwaids0"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihiscrwaids0_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihiscrwaids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihiscrwaids0"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihiscrwaids0_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihiscrwaids0")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihiscrwaids0"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihiscrwaids0_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihiscrwaids0")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihiscrwaids0"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihiscrwaids0_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihiscrwaids0"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
-#Can we simplify and use just a method. Then import these methods?
-# def missingreport_s3(context, msg: str, source="cuahsihiscrwaids0"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="cuahsihiscrwaids0"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_cuahsihiscrwaids0():
- containers = cuahsihiscrwaids0_getImage()
- harvest = cuahsihiscrwaids0_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = cuahsihiscrwaids0_missingreport_s3(start=harvest)
- report_idstat = cuahsihiscrwaids0_identifier_stats(start=report_ms3)
- # for some reason, this causes a "msg parameter missing" error
- report_bucketurl = cuahsihiscrwaids0_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="cuahsihiscrwaids0")
- load_release = cuahsihiscrwaids0_naburelease(start=harvest)
- load_uploadrelease = cuahsihiscrwaids0_uploadrelease(start=load_release)
-
- load_prune = cuahsihiscrwaids0_nabu_prune(start=load_uploadrelease)
- load_prov = cuahsihiscrwaids0_nabuprov(start=load_prune)
- load_org = cuahsihiscrwaids0_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=cuahsihiscrwaids0_missingreport_graph(start=load_org)
- report_graph=cuahsihiscrwaids0_graph_reports(start=report_msgraph)
-
-
-
-
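The `_create_service` helper in these generated files runs each Gleaner/Nabu invocation as a one-shot Docker Swarm "replicated-job" service and then polls until its container appears. A condensed sketch of that pattern, assuming a reachable swarm manager; the image, command, and service name here are illustrative:

```python
import time

import docker
from docker.types import RestartPolicy, ServiceMode

client = docker.from_env()  # assumes swarm mode is enabled on this host
service = client.services.create(
    "alpine:latest",                      # illustrative image
    args=["echo", "hello"],
    name="sch_example_job",
    restart_policy=RestartPolicy(condition="none"),
    # replicated-job mode: the task runs to completion and is not restarted
    mode=ServiceMode("replicated-job", replicas=1, concurrency=1),
)

containers = []
for _ in range(12):                       # same ~12 second budget as above
    time.sleep(1)
    containers = client.containers.list(
        all=True,
        filters={"label": "com.docker.swarm.service.name=sch_example_job"},
    )
    if containers:
        break
else:
    service.remove()
    raise RuntimeError("container for service sch_example_job never started")

exit_status = containers[0].wait()["StatusCode"]  # block until the job exits
print(containers[0].logs(stdout=True, stderr=True).decode())
service.remove()                          # job services are not auto-removed
```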
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihiscuisoids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihiscuisoids0.py
deleted file mode 100644
index 44eee490..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihiscuisoids0.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils.util
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
- # revision of EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
- ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
- # docker.py if replicated job, total completions = replicas
- # replicas =0 you do not get a container
- service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
- mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise f"Container for service {name} not starting"
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
- cid = container.id # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
- ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
- # Future: need to extract files and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def cuahsihiscuisoids0_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def cuahsihiscuisoids0_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "cuahsihiscuisoids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihiscuisoids0_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "cuahsihiscuisoids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihiscuisoids0_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "cuahsihiscuisoids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihiscuisoids0_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "cuahsihiscuisoids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihiscuisoids0_naburelease(context):
- returned_value = gleanerio(context,("release"), "cuahsihiscuisoids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihiscuisoids0_uploadrelease(context):
- returned_value = postRelease("cuahsihiscuisoids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def cuahsihiscuisoids0_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihiscuisoids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihiscuisoids0"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihiscuisoids0_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihiscuisoids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihiscuisoids0"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihiscuisoids0_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihiscuisoids0")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihiscuisoids0"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihiscuisoids0_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihiscuisoids0")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihiscuisoids0"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihiscuisoids0_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihiscuisoids0"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
-# Can we simplify and just use a method, then import these methods?
-# def missingreport_s3(context, msg: str, source="cuahsihiscuisoids0"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="cuahsihiscuisoids0"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_cuahsihiscuisoids0():
- containers = cuahsihiscuisoids0_getImage()
- harvest = cuahsihiscuisoids0_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = cuahsihiscuisoids0_missingreport_s3(start=harvest)
- report_idstat = cuahsihiscuisoids0_identifier_stats(start=report_ms3)
-    # for some reason, this causes a "msg parameter missing" error
- report_bucketurl = cuahsihiscuisoids0_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="cuahsihiscuisoids0")
- load_release = cuahsihiscuisoids0_naburelease(start=harvest)
- load_uploadrelease = cuahsihiscuisoids0_uploadrelease(start=load_release)
-
- load_prune = cuahsihiscuisoids0_nabu_prune(start=load_uploadrelease)
- load_prov = cuahsihiscuisoids0_nabuprov(start=load_prune)
- load_org = cuahsihiscuisoids0_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=cuahsihiscuisoids0_missingreport_graph(start=load_org)
- report_graph=cuahsihiscuisoids0_graph_reports(start=report_msgraph)
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisczoarizids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisczoarizids0.py
deleted file mode 100644
index aab39321..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisczoarizids0.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils.util
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, not the names of the objects in the docker compose file
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
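-# (illustration, not from the original code: with default compose naming, a volume
-#  declared as "gleaner_configs" in a compose project named "dagster" appears in
-#  docker as "dagster_gleaner_configs", which is what the default above expects)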
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL', 'false')))  # default to false when unset
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
-    # use a context manager so the file handle is closed promptly
-    with open(image_path, 'rb') as f:
-        return f.read()
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
-    # Make 'X' bucket if it does not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
-    # revision of EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
-    ## thoughts
-    # return service and container, since there is only one
- restart_policy = RestartPolicy(condition='none')
-    # in docker.py, for a replicated job, total completions = replicas
-    # with replicas=0 you do not get a container
-    service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
-        mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise f"Container for service {name} not starting"
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
-        # add env variables here, e.g. "Env": ["FOO=bar", "BAZ=quux"]
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
-    cid = container.id  # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
-    ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
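-    # a minimal sketch of such a non-raising helper (hypothetical, not wired into
-    # this file; it reuses the s3loader defined above):
-    #
-    # def _upload_logs_quietly(container, name):
-    #     try:
-    #         body = container.logs(stdout=True, stderr=True, stream=False, follow=False)
-    #         s3loader(body, name)
-    #     except Exception as ex:
-    #         get_dagster_logger().info(f"log upload failed: {repr(ex)}")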
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
-        # Future: need to extract files and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
-    # looks like get_archive also has issues; it returns nothing.
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def cuahsihisczoarizids0_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def cuahsihisczoarizids0_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "cuahsihisczoarizids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisczoarizids0_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "cuahsihisczoarizids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisczoarizids0_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "cuahsihisczoarizids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisczoarizids0_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "cuahsihisczoarizids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisczoarizids0_naburelease(context):
- returned_value = gleanerio(context,("release"), "cuahsihisczoarizids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisczoarizids0_uploadrelease(context):
- returned_value = postRelease("cuahsihisczoarizids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisczoarizids0_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisczoarizids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisczoarizids0"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-    r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisczoarizids0_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisczoarizids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisczoarizids0"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisczoarizids0_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisczoarizids0")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisczoarizids0"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisczoarizids0_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisczoarizids0")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisczoarizids0"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisczoarizids0_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisczoarizids0"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
-# Can we simplify and just use a method, then import these methods?
-# def missingreport_s3(context, msg: str, source="cuahsihisczoarizids0"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="cuahsihisczoarizids0"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_cuahsihisczoarizids0():
- containers = cuahsihisczoarizids0_getImage()
- harvest = cuahsihisczoarizids0_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = cuahsihisczoarizids0_missingreport_s3(start=harvest)
- report_idstat = cuahsihisczoarizids0_identifier_stats(start=report_ms3)
-    # for some reason, this causes a "msg parameter missing" error
- report_bucketurl = cuahsihisczoarizids0_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="cuahsihisczoarizids0")
- load_release = cuahsihisczoarizids0_naburelease(start=harvest)
- load_uploadrelease = cuahsihisczoarizids0_uploadrelease(start=load_release)
-
- load_prune = cuahsihisczoarizids0_nabu_prune(start=load_uploadrelease)
- load_prov = cuahsihisczoarizids0_nabuprov(start=load_prune)
- load_org = cuahsihisczoarizids0_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=cuahsihisczoarizids0_missingreport_graph(start=load_org)
- report_graph=cuahsihisczoarizids0_graph_reports(start=report_msgraph)
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisczoboulderids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisczoboulderids0.py
deleted file mode 100644
index 30b03230..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisczoboulderids0.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils.util
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, not the names of the objects in the docker compose file
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
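-# (illustration, not from the original code: with default compose naming, a volume
-#  declared as "gleaner_configs" in a compose project named "dagster" appears in
-#  docker as "dagster_gleaner_configs", which is what the default above expects)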
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL', 'false')))  # default to false when unset
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
-    # use a context manager so the file handle is closed promptly
-    with open(image_path, 'rb') as f:
-        return f.read()
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
-    # Make 'X' bucket if it does not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
-    # revision of EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
-    ## thoughts
-    # return service and container, since there is only one
- restart_policy = RestartPolicy(condition='none')
-    # in docker.py, for a replicated job, total completions = replicas
-    # with replicas=0 you do not get a container
-    service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
-        mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise f"Container for service {name} not starting"
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
-        # add env variables here, e.g. "Env": ["FOO=bar", "BAZ=quux"]
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
-    cid = container.id  # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
-    ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
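-    # a minimal sketch of such a non-raising helper (hypothetical, not wired into
-    # this file; it reuses the s3loader defined above):
-    #
-    # def _upload_logs_quietly(container, name):
-    #     try:
-    #         body = container.logs(stdout=True, stderr=True, stream=False, follow=False)
-    #         s3loader(body, name)
-    #     except Exception as ex:
-    #         get_dagster_logger().info(f"log upload failed: {repr(ex)}")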
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
-        # Future: need to extract files and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
-    # looks like get_archive also has issues; it returns nothing.
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def cuahsihisczoboulderids0_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def cuahsihisczoboulderids0_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "cuahsihisczoboulderids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisczoboulderids0_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "cuahsihisczoboulderids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisczoboulderids0_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "cuahsihisczoboulderids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisczoboulderids0_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "cuahsihisczoboulderids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisczoboulderids0_naburelease(context):
- returned_value = gleanerio(context,("release"), "cuahsihisczoboulderids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisczoboulderids0_uploadrelease(context):
- returned_value = postRelease("cuahsihisczoboulderids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisczoboulderids0_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisczoboulderids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisczoboulderids0"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisczoboulderids0_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisczoboulderids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisczoboulderids0"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisczoboulderids0_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisczoboulderids0")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisczoboulderids0"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisczoboulderids0_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisczoboulderids0")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisczoboulderids0"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisczoboulderids0_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisczoboulderids0"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
-#Can we simplify and use just a method. Then import these methods?
-# def missingreport_s3(context, msg: str, source="cuahsihisczoboulderids0"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="cuahsihisczoboulderids0"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_cuahsihisczoboulderids0():
- containers = cuahsihisczoboulderids0_getImage()
- harvest = cuahsihisczoboulderids0_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = cuahsihisczoboulderids0_missingreport_s3(start=harvest)
- report_idstat = cuahsihisczoboulderids0_identifier_stats(start=report_ms3)
- # for some reason, this causes a "msg parameter missing" error
- report_bucketurl = cuahsihisczoboulderids0_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="cuahsihisczoboulderids0")
- load_release = cuahsihisczoboulderids0_naburelease(start=harvest)
- load_uploadrelease = cuahsihisczoboulderids0_uploadrelease(start=load_release)
-
- load_prune = cuahsihisczoboulderids0_nabu_prune(start=load_uploadrelease)
- load_prov = cuahsihisczoboulderids0_nabuprov(start=load_prune)
- load_org = cuahsihisczoboulderids0_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=cuahsihisczoboulderids0_missingreport_graph(start=load_org)
- report_graph=cuahsihisczoboulderids0_graph_reports(start=report_msgraph)
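-
-# A minimal sketch, not part of the generated template: the @graph above
-# only wires ops together; to run it on a cadence it still needs a job and
-# a schedule. The job name and cron string here are illustrative placeholders.
-from dagster import ScheduleDefinition
-
-harvest_cuahsihisczoboulderids0_job = harvest_cuahsihisczoboulderids0.to_job()
-harvest_cuahsihisczoboulderids0_schedule = ScheduleDefinition(
- job=harvest_cuahsihisczoboulderids0_job,
- cron_schedule="0 4 * * 0", # placeholder: Sundays 04:00
-)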
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisczocatalinaids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisczocatalinaids0.py
deleted file mode 100644
index 584b550d..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisczocatalinaids0.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, and not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
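-# A hedged usage sketch for s3reader (the object name is illustrative):
-# minio's get_object returns an HTTPResponse the caller must read and
-# release; on S3Error the function above only logs, so it yields None.
-def _read_object_bytes(object_name="scheduler/configs/GleanerCfg.tgz"):
- resp = s3reader(object_name)
- if resp is None:
- return None
- try:
- return resp.read()
- finally:
- resp.close()
- resp.release_conn()
-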
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
- # revision of EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
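-# A hedged sketch, not in the template: Blazegraph answers this POST with
-# something like '<data modified="N" milliseconds="M"/>', so instead of
-# substring-matching 'data modified="0"' as postRelease does, the count
-# could be parsed out. Assumes that response format holds.
-import re
-
-def _mutation_count(response_text):
- m = re.search(r'data modified="(\d+)"', response_text)
- return int(m.group(1)) if m else 0
-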
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
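-# A small hedged sketch: _get_client injects the Portainer X-API-Key into
-# docker-py's default headers, so a cheap preflight before creating services
-# is DockerClient.ping(), which returns True or raises on failure.
-def _check_docker_endpoint(container_context):
- client = _get_client(container_context)
- if not client.ping():
- raise Exception(f"Docker API at {URL} not reachable")
- return client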
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
- ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
- # docker.py: for a replicated job, total completions = replicas
- # replicas=0 means you do not get a container
- service_mode = ServiceMode("replicated-job",concurrency=1,replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
- mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise f"Container for service {name} not starting"
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
-
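-# A hedged alternative sketch: rather than listing containers by swarm
-# label, the replicated-job could be tracked via service.tasks(), whose
-# entries carry Status.State ('running', 'complete', 'failed', ...). This
-# follows docker-py's documented task shape but is untested here.
-def _wait_for_task_state(service, timeout_s=12):
- for _ in range(timeout_s):
- time.sleep(1)
- for task in service.tasks():
- state = task.get("Status", {}).get("State")
- if state in ("running", "complete", "failed"):
- return state
- raise Exception(f"service {service.name} produced no task in time")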
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
- cid = container.id # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
- ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
- # Future: need to extract files and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues; it returns nothing:
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def cuahsihisczocatalinaids0_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def cuahsihisczocatalinaids0_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "cuahsihisczocatalinaids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisczocatalinaids0_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "cuahsihisczocatalinaids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisczocatalinaids0_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "cuahsihisczocatalinaids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisczocatalinaids0_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "cuahsihisczocatalinaids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisczocatalinaids0_naburelease(context):
- returned_value = gleanerio(context,("release"), "cuahsihisczocatalinaids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisczocatalinaids0_uploadrelease(context):
- returned_value = postRelease("cuahsihisczocatalinaids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisczocatalinaids0_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisczocatalinaids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisczocatalinaids0"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisczocatalinaids0_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisczocatalinaids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisczocatalinaids0"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisczocatalinaids0_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisczocatalinaids0")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisczocatalinaids0"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisczocatalinaids0_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisczocatalinaids0")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisczocatalinaids0"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisczocatalinaids0_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisczocatalinaids0"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
-#Can we simplify and use just a method. Then import these methods?
-# def missingreport_s3(context, msg: str, source="cuahsihisczocatalinaids0"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="cuahsihisczocatalinaids0"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
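-
-# A hedged sketch toward the question above (one method, imported, instead
-# of per-source copies): Dagster ops can be minted by a small factory, so
-# the boilerplate could collapse to something like this. Names illustrative.
-def make_gleaner_op(source: str):
- @op(name=f"{source}_gleaner", ins={"start": In(Nothing)})
- def _gleaner_op(context):
- returned_value = gleanerio(context, "gleaner", source)
- get_dagster_logger().info(f"Gleaner returned {returned_value}")
- return _gleaner_op
-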
-@graph
-def harvest_cuahsihisczocatalinaids0():
- containers = cuahsihisczocatalinaids0_getImage()
- harvest = cuahsihisczocatalinaids0_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = cuahsihisczocatalinaids0_missingreport_s3(start=harvest)
- report_idstat = cuahsihisczocatalinaids0_identifier_stats(start=report_ms3)
- # for some reason, this causes a "msg parameter missing" error
- report_bucketurl = cuahsihisczocatalinaids0_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="cuahsihisczocatalinaids0")
- load_release = cuahsihisczocatalinaids0_naburelease(start=harvest)
- load_uploadrelease = cuahsihisczocatalinaids0_uploadrelease(start=load_release)
-
- load_prune = cuahsihisczocatalinaids0_nabu_prune(start=load_uploadrelease)
- load_prov = cuahsihisczocatalinaids0_nabuprov(start=load_prune)
- load_org = cuahsihisczocatalinaids0_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=cuahsihisczocatalinaids0_missingreport_graph(start=load_org)
- report_graph=cuahsihisczocatalinaids0_graph_reports(start=report_msgraph)
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisczoluquilloids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisczoluquilloids0.py
deleted file mode 100644
index efc746e3..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisczoluquilloids0.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, and not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
- # revision of EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
- ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
- # docker.py: for a replicated job, total completions = replicas
- # replicas=0 means you do not get a container
- service_mode = ServiceMode("replicated-job",concurrency=1,replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
- mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise f"Container for service {name} not starting"
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
- cid = container.id # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
- ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
- # Future: need to extract files and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues; it returns nothing:
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def cuahsihisczoluquilloids0_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def cuahsihisczoluquilloids0_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "cuahsihisczoluquilloids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisczoluquilloids0_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "cuahsihisczoluquilloids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisczoluquilloids0_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "cuahsihisczoluquilloids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisczoluquilloids0_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "cuahsihisczoluquilloids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisczoluquilloids0_naburelease(context):
- returned_value = gleanerio(context,("release"), "cuahsihisczoluquilloids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisczoluquilloids0_uploadrelease(context):
- returned_value = postRelease("cuahsihisczoluquilloids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisczoluquilloids0_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisczoluquilloids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisczoluquilloids0"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisczoluquilloids0_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisczoluquilloids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisczoluquilloids0"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisczoluquilloids0_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisczoluquilloids0")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisczoluquilloids0"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisczoluquilloids0_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisczoluquilloids0")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisczoluquilloids0"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisczoluquilloids0_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisczoluquilloids0"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
-#Can we simplify and use just a method. Then import these methods?
-# def missingreport_s3(context, msg: str, source="cuahsihisczoluquilloids0"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="cuahsihisczoluquilloids0"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_cuahsihisczoluquilloids0():
- containers = cuahsihisczoluquilloids0_getImage()
- harvest = cuahsihisczoluquilloids0_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
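- # the pattern in miniature (op names here are illustrative, not from this file):
- #   @op(ins={"start": In(Nothing)})
- #   def some_step(context): ...
- #   some_step(start=prior_step())  # order-only dependency, no data passed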
-
- report_ms3 = cuahsihisczoluquilloids0_missingreport_s3(start=harvest)
- report_idstat = cuahsihisczoluquilloids0_identifier_stats(start=report_ms3)
- # for some reason, this causes a missing msg parameter error
- report_bucketurl = cuahsihisczoluquilloids0_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="cuahsihisczoluquilloids0")
- load_release = cuahsihisczoluquilloids0_naburelease(start=harvest)
- load_uploadrelease = cuahsihisczoluquilloids0_uploadrelease(start=load_release)
-
- load_prune = cuahsihisczoluquilloids0_nabu_prune(start=load_uploadrelease)
- load_prov = cuahsihisczoluquilloids0_nabuprov(start=load_prune)
- load_org = cuahsihisczoluquilloids0_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=cuahsihisczoluquilloids0_missingreport_graph(start=load_org)
- report_graph=cuahsihisczoluquilloids0_graph_reports(start=report_msgraph)
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisczomercedids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisczomercedids0.py
deleted file mode 100644
index 7246c604..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisczomercedids0.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, and not the names of the object in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
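-# illustrative values only: any *.amazonaws.com host collapses to the global
-# S3 endpoint; everything else passes through unchanged, e.g.
-#   _pythonMinioUrl("my-bucket.s3.us-east-1.amazonaws.com") -> "s3.amazonaws.com"
-#   _pythonMinioUrl("minio.example.org")                    -> "minio.example.org"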
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
- # revision of EC utilities, will have a insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
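-# postRelease leans on the Blazegraph REST convention that a POST to
-# .../namespace/{ns}/sparql?uri=<fileURL> makes the server fetch and load
-# that file. A hypothetical call, values illustrative only:
-#   requests.post("http://graph:9999/blazegraph/namespace/eco/sparql"
-#                 "?uri=http://minio:9000/gleaner/graphs/latest/src_release.nq")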
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
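-# note: docker-py has no first-class per-request headers, so the Portainer
-# API key is pushed into both the session headers and _general_configs;
-# the update_headers decorator linked below reads _general_configs["HttpHeaders"],
-# the same mechanism ~/.docker/config.json uses.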
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
- ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
- # docker.py: for a replicated job, total completions = replicas
- # replicas=0 means you do not get a container
- service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
- mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise Exception(f"Container for service {name} not starting")
-
- get_dagster_logger().info(f"containers found: {len(containers)}")
- return service, containers[0]
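-# the loop above is a crude readiness poll: a replicated-job service spawns
-# one task, and its container is found via the swarm service-name label.
-# After about a dozen one-second polls with no container, the create is abandoned.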
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
- cid = container.id # legacy until the service start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
- ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
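- # container.wait() blocks until the task exits and returns the exit code;
- # raising is deferred until after the logs have been copied out below.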
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
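- # GET /containers/{id}/archive returns a tar stream of the requested path
- # (Docker Engine API); X-API-Key satisfies the Portainer proxy in front of it.
- # The tar archive is uploaded to s3 unextracted for now.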
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
- # Future: need to extract files and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def cuahsihisczomercedids0_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def cuahsihisczomercedids0_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "cuahsihisczomercedids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisczomercedids0_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "cuahsihisczomercedids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisczomercedids0_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "cuahsihisczomercedids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisczomercedids0_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "cuahsihisczomercedids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisczomercedids0_naburelease(context):
- returned_value = gleanerio(context,("release"), "cuahsihisczomercedids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisczomercedids0_uploadrelease(context):
- returned_value = postRelease("cuahsihisczomercedids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisczomercedids0_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisczomercedids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisczomercedids0"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisczomercedids0_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisczomercedids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisczomercedids0"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisczomercedids0_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisczomercedids0")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisczomercedids0"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisczomercedids0_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisczomercedids0")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisczomercedids0"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisczomercedids0_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisczomercedids0"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
-#Can we simplify and use just a method. Then import these methods?
-# def missingreport_s3(context, msg: str, source="cuahsihisczomercedids0"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="cuahsihisczomercedids0"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_cuahsihisczomercedids0():
- containers = cuahsihisczomercedids0_getImage()
- harvest = cuahsihisczomercedids0_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = cuahsihisczomercedids0_missingreport_s3(start=harvest)
- report_idstat = cuahsihisczomercedids0_identifier_stats(start=report_ms3)
- # for some reason, this causes a missing msg parameter error
- report_bucketurl = cuahsihisczomercedids0_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="cuahsihisczomercedids0")
- load_release = cuahsihisczomercedids0_naburelease(start=harvest)
- load_uploadrelease = cuahsihisczomercedids0_uploadrelease(start=load_release)
-
- load_prune = cuahsihisczomercedids0_nabu_prune(start=load_uploadrelease)
- load_prov = cuahsihisczomercedids0_nabuprov(start=load_prune)
- load_org = cuahsihisczomercedids0_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=cuahsihisczomercedids0_missingreport_graph(start=load_org)
- report_graph=cuahsihisczomercedids0_graph_reports(start=report_msgraph)
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisczopsuids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisczopsuids0.py
deleted file mode 100644
index 7d419530..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisczopsuids0.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, and not the names of the object in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
- # revision of EC utilities, will have a insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
- ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
- # docker.py: for a replicated job, total completions = replicas
- # replicas=0 means you do not get a container
- service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
- mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise Exception(f"Container for service {name} not starting")
-
- get_dagster_logger().info(f"containers found: {len(containers)}")
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
- cid = container.id # legacy until the service start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
- ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
- # Future: need to extract files and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues; it returns nothing:
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def cuahsihisczopsuids0_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def cuahsihisczopsuids0_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "cuahsihisczopsuids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisczopsuids0_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "cuahsihisczopsuids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisczopsuids0_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "cuahsihisczopsuids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisczopsuids0_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "cuahsihisczopsuids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisczopsuids0_naburelease(context):
- returned_value = gleanerio(context,("release"), "cuahsihisczopsuids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisczopsuids0_uploadrelease(context):
- returned_value = postRelease("cuahsihisczopsuids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisczopsuids0_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisczopsuids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisczopsuids0"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisczopsuids0_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisczopsuids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisczopsuids0"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisczopsuids0_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisczopsuids0")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisczopsuids0"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisczopsuids0_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisczopsuids0")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisczopsuids0"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisczopsuids0_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisczopsuids0"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
-# Can we simplify and use just a method, then import these methods?
-# def missingreport_s3(context, msg: str, source="cuahsihisczopsuids0"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="cuahsihisczopsuids0"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_cuahsihisczopsuids0():
- containers = cuahsihisczopsuids0_getImage()
- harvest = cuahsihisczopsuids0_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = cuahsihisczopsuids0_missingreport_s3(start=harvest)
- report_idstat = cuahsihisczopsuids0_identifier_stats(start=report_ms3)
- # for some reason, this caused a "msg parameter missing" error
- report_bucketurl = cuahsihisczopsuids0_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="cuahsihisczopsuids0")
- load_release = cuahsihisczopsuids0_naburelease(start=harvest)
- load_uploadrelease = cuahsihisczopsuids0_uploadrelease(start=load_release)
-
- load_prune = cuahsihisczopsuids0_nabu_prune(start=load_uploadrelease)
- load_prov = cuahsihisczopsuids0_nabuprov(start=load_prune)
- load_org = cuahsihisczopsuids0_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=cuahsihisczopsuids0_missingreport_graph(start=load_org)
- report_graph=cuahsihisczopsuids0_graph_reports(start=report_msgraph)
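-
-# A minimal sketch (an assumption, with illustrative names and cron string) of
-# how the graph above could be wired into a runnable job and schedule using
-# standard Dagster APIs:
-#
-#   harvest_cuahsihisczopsuids0_job = harvest_cuahsihisczopsuids0.to_job()
-#   harvest_cuahsihisczopsuids0_schedule = ScheduleDefinition(
-#       job=harvest_cuahsihisczopsuids0_job, cron_schedule="0 3 * * *")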
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisczoudelids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisczoudelids0.py
deleted file mode 100644
index b9ee4947..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisczoudelids0.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in Docker, not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
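-# e.g. _pythonMinioUrl("mybucket.s3.amazonaws.com") returns "s3.amazonaws.com";
-# any non-AWS address is passed through unchanged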
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
- # revision of EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
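-# postRelease asks the graph store to ingest the release nquads file straight
-# from its MinIO URL, relying on the Blazegraph REST convention of
-# POST <sparql-endpoint>?uri=<remote-file> to load data from a remote location.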
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
- ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
- # docker-py: for a replicated-job, total completions = replicas
- # with replicas=0 you do not get a container
- service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
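- # each ConfigReference mounts the named Docker swarm config into the task's
- # container at the gleaner/nabu config path, so no volume bind is needed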
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
- mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise f"Container for service {name} not starting"
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
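-# _create_service hands back both handles: the swarm service (removed in the
-# caller's finally block via service.remove()) and its single task container
-# (used for logs, wait, and the log-archive download).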
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
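- # at this point IMAGE/ARGS/NAME/WorkingDir describe a one-shot container:
- # gleaner runs "--cfg <config> -source <source> --rude", and the nabu modes
- # run "nabu --cfg <config> <verb> --prefix <prefix>" as set above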
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add env variables here, e.g. "Env": ["FOO=bar", "BAZ=quux"]
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception:
- raise
-
-
- cid = container.id # legacy until the service start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
- ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # we pull the logs first, then throw an error if the exit code is non-zero
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes-like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes-like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
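-# the Docker Engine API's GET /containers/{id}/archive returns a tar stream of
-# the requested path; the request is routed through Portainer, hence X-API-Key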
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
- # Future: need to extract files and upload them
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues; it returns nothing:
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def cuahsihisczoudelids0_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
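- # pulling both images up front keeps the later per-mode service creation from
- # paying the pull cost, or failing on a missing image, mid-pipeline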
-@op(ins={"start": In(Nothing)})
-def cuahsihisczoudelids0_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "cuahsihisczoudelids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisczoudelids0_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "cuahsihisczoudelids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisczoudelids0_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "cuahsihisczoudelids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisczoudelids0_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "cuahsihisczoudelids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisczoudelids0_naburelease(context):
- returned_value = gleanerio(context,("release"), "cuahsihisczoudelids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisczoudelids0_uploadrelease(context):
- returned_value = postRelease("cuahsihisczoudelids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisczoudelids0_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisczoudelids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisczoudelids0"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisczoudelids0_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisczoudelids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisczoudelids0"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisczoudelids0_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisczoudelids0")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisczoudelids0"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisczoudelids0_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisczoudelids0")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisczoudelids0"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisczoudelids0_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisczoudelids0"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
-# Can we simplify and use just a method, then import these methods?
-# def missingreport_s3(context, msg: str, source="cuahsihisczoudelids0"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="cuahsihisczoudelids0"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_cuahsihisczoudelids0():
- containers = cuahsihisczoudelids0_getImage()
- harvest = cuahsihisczoudelids0_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
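- # `start=In(Nothing)` only enforces run order; no value is passed between ops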
-
- report_ms3 = cuahsihisczoudelids0_missingreport_s3(start=harvest)
- report_idstat = cuahsihisczoudelids0_identifier_stats(start=report_ms3)
- # for some reason, this caused a "msg parameter missing" error
- report_bucketurl = cuahsihisczoudelids0_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="cuahsihisczoudelids0")
- load_release = cuahsihisczoudelids0_naburelease(start=harvest)
- load_uploadrelease = cuahsihisczoudelids0_uploadrelease(start=load_release)
-
- load_prune = cuahsihisczoudelids0_nabu_prune(start=load_uploadrelease)
- load_prov = cuahsihisczoudelids0_nabuprov(start=load_prune)
- load_org = cuahsihisczoudelids0_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=cuahsihisczoudelids0_missingreport_graph(start=load_org)
- report_graph=cuahsihisczoudelids0_graph_reports(start=report_msgraph)
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisdrwiids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisdrwiids0.py
deleted file mode 100644
index 7813edac..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisdrwiids0.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in Docker, not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
- # revision of EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
- ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
- # docker-py: for a replicated-job, total completions = replicas
- # with replicas=0 you do not get a container
- service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
- mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise f"Container for service {name} not starting"
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
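-# note: the wait loop above polls once per second and raises after ~12 seconds
-# if swarm never schedules a container for the replicated-job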
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add env variables here, e.g. "Env": ["FOO=bar", "BAZ=quux"]
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception:
- raise
-
-
- cid = container.id # legacy until the service start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
- ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
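-        # GET /containers/{id}/archive is the Docker Engine API call that
-        # returns the requested path as a tar stream; it is uploaded to s3
-        # as-is below, extraction of the files is still a TODO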
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
-        # Future: need to extract files, and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def cuahsihisdrwiids0_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def cuahsihisdrwiids0_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "cuahsihisdrwiids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisdrwiids0_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "cuahsihisdrwiids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisdrwiids0_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "cuahsihisdrwiids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisdrwiids0_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "cuahsihisdrwiids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisdrwiids0_naburelease(context):
- returned_value = gleanerio(context,("release"), "cuahsihisdrwiids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisdrwiids0_uploadrelease(context):
- returned_value = postRelease("cuahsihisdrwiids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisdrwiids0_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisdrwiids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisdrwiids0"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-    r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisdrwiids0_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisdrwiids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisdrwiids0"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisdrwiids0_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisdrwiids0")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisdrwiids0"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisdrwiids0_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisdrwiids0")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisdrwiids0"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisdrwiids0_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisdrwiids0"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
-#Can we simplify and use just a method. Then import these methods?
-# def missingreport_s3(context, msg: str, source="cuahsihisdrwiids0"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="cuahsihisdrwiids0"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_cuahsihisdrwiids0():
- containers = cuahsihisdrwiids0_getImage()
- harvest = cuahsihisdrwiids0_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = cuahsihisdrwiids0_missingreport_s3(start=harvest)
- report_idstat = cuahsihisdrwiids0_identifier_stats(start=report_ms3)
-    # for some reason, this caused a missing msg parameter error
- report_bucketurl = cuahsihisdrwiids0_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="cuahsihisdrwiids0")
- load_release = cuahsihisdrwiids0_naburelease(start=harvest)
- load_uploadrelease = cuahsihisdrwiids0_uploadrelease(start=load_release)
-
- load_prune = cuahsihisdrwiids0_nabu_prune(start=load_uploadrelease)
- load_prov = cuahsihisdrwiids0_nabuprov(start=load_prune)
- load_org = cuahsihisdrwiids0_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=cuahsihisdrwiids0_missingreport_graph(start=load_org)
- report_graph=cuahsihisdrwiids0_graph_reports(start=report_msgraph)
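-# overall order: pull images -> gleaner harvest -> s3/identifier/bucket
-# reports, and in parallel release -> upload -> prune -> prov -> orgs;
-# the graph-based reports run last, once the triplestore has been loaded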
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisfarmrwaids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisfarmrwaids0.py
deleted file mode 100644
index 920612b5..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisfarmrwaids0.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
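-# note: distutils is deprecated (PEP 632) and removed in Python 3.12, so
-# this strtobool call will need a local replacement on newer interpreters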
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
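-    # on the default ports (80 plain / 443 TLS) the Minio client expects a
-    # bare hostname; for any other port the endpoint must include it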
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
-    # revision of EC utilities, will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
-        # a successful insert returns a body like <data modified="N" .../>
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
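-    # base_url points at a Portainer-proxied Docker API; Portainer
-    # authenticates via the X-API-Key header, so it is set both in the
-    # general configs and directly on the live session headers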
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
-    ## thoughts:
-    # return both the service and its container, since there is only one
-    restart_policy = RestartPolicy(condition='none')
-    # docker-py: for a replicated job, total completions = replicas;
-    # with replicas=0 you never get a container
-    service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
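-    # the swarm ConfigReferences mount the gleaner/nabu YAML configs into
-    # the service's container at the paths resolved above, so no bind
-    # volume is required for configuration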
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
-        mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
-            raise Exception(f"Container for service {name} not starting")
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
-        # add env variables here, e.g. "Env": ["FOO=bar","BAZ=quux"]
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
-        cid = container.id # legacy until the service start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
-        ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
-        # Future: need to extract files, and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def cuahsihisfarmrwaids0_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def cuahsihisfarmrwaids0_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "cuahsihisfarmrwaids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisfarmrwaids0_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "cuahsihisfarmrwaids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisfarmrwaids0_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "cuahsihisfarmrwaids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisfarmrwaids0_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "cuahsihisfarmrwaids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisfarmrwaids0_naburelease(context):
- returned_value = gleanerio(context,("release"), "cuahsihisfarmrwaids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisfarmrwaids0_uploadrelease(context):
- returned_value = postRelease("cuahsihisfarmrwaids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisfarmrwaids0_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisfarmrwaids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisfarmrwaids0"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-    r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisfarmrwaids0_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisfarmrwaids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisfarmrwaids0"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisfarmrwaids0_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisfarmrwaids0")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisfarmrwaids0"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisfarmrwaids0_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisfarmrwaids0")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisfarmrwaids0"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisfarmrwaids0_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisfarmrwaids0"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
-#Can we simplify and use just a method. Then import these methods?
-# def missingreport_s3(context, msg: str, source="cuahsihisfarmrwaids0"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="cuahsihisfarmrwaids0"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_cuahsihisfarmrwaids0():
- containers = cuahsihisfarmrwaids0_getImage()
- harvest = cuahsihisfarmrwaids0_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = cuahsihisfarmrwaids0_missingreport_s3(start=harvest)
- report_idstat = cuahsihisfarmrwaids0_identifier_stats(start=report_ms3)
-    # for some reason, this caused a missing msg parameter error
- report_bucketurl = cuahsihisfarmrwaids0_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="cuahsihisfarmrwaids0")
- load_release = cuahsihisfarmrwaids0_naburelease(start=harvest)
- load_uploadrelease = cuahsihisfarmrwaids0_uploadrelease(start=load_release)
-
- load_prune = cuahsihisfarmrwaids0_nabu_prune(start=load_uploadrelease)
- load_prov = cuahsihisfarmrwaids0_nabuprov(start=load_prune)
- load_org = cuahsihisfarmrwaids0_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=cuahsihisfarmrwaids0_missingreport_graph(start=load_org)
- report_graph=cuahsihisfarmrwaids0_graph_reports(start=report_msgraph)
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisfcelterids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisfcelterids0.py
deleted file mode 100644
index c192b4e5..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisfcelterids0.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
-    # revision of EC utilities, will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
-        # a successful insert returns a body like <data modified="N" .../>
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
-    ## thoughts:
-    # return both the service and its container, since there is only one
-    restart_policy = RestartPolicy(condition='none')
-    # docker-py: for a replicated job, total completions = replicas;
-    # with replicas=0 you never get a container
-    service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
-        mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
-            raise Exception(f"Container for service {name} not starting")
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
-        # add env variables here, e.g. "Env": ["FOO=bar","BAZ=quux"]
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
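- # merge() layers the op-level context on top of the run launcher's, so the
- # env vars, network, and working_dir set here take effect for this op.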
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
- cid = container.id # legacy until the service start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
- ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs no matter what, so do not exit yet.
- ## or should the log collection move into finally?
- ### in which case it needs helpers that do not raise.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
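- # urlopen raises HTTPError on non-2xx responses, so reaching this point
- # means the archive request succeeded.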
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
- # Future: need to extract the files, then upload them
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def cuahsihisfcelterids0_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def cuahsihisfcelterids0_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "cuahsihisfcelterids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisfcelterids0_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "cuahsihisfcelterids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisfcelterids0_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "cuahsihisfcelterids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisfcelterids0_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "cuahsihisfcelterids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisfcelterids0_naburelease(context):
- returned_value = gleanerio(context,("release"), "cuahsihisfcelterids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisfcelterids0_uploadrelease(context):
- returned_value = postRelease("cuahsihisfcelterids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisfcelterids0_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisfcelterids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisfcelterids0"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisfcelterids0_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisfcelterids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisfcelterids0"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisfcelterids0_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisfcelterids0")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisfcelterids0"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisfcelterids0_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisfcelterids0")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisfcelterids0"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisfcelterids0_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisfcelterids0"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
-#Can we simplify and use just a method. Then import these methods?
-# def missingreport_s3(context, msg: str, source="cuahsihisfcelterids0"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="cuahsihisfcelterids0"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_cuahsihisfcelterids0():
- containers = cuahsihisfcelterids0_getImage()
- harvest = cuahsihisfcelterids0_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = cuahsihisfcelterids0_missingreport_s3(start=harvest)
- report_idstat = cuahsihisfcelterids0_identifier_stats(start=report_ms3)
- # for some reason, this caused a "msg parameter missing" error
- report_bucketurl = cuahsihisfcelterids0_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="cuahsihisfcelterids0")
- load_release = cuahsihisfcelterids0_naburelease(start=harvest)
- load_uploadrelease = cuahsihisfcelterids0_uploadrelease(start=load_release)
-
- load_prune = cuahsihisfcelterids0_nabu_prune(start=load_uploadrelease)
- load_prov = cuahsihisfcelterids0_nabuprov(start=load_prune)
- load_org = cuahsihisfcelterids0_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=cuahsihisfcelterids0_missingreport_graph(start=load_org)
- report_graph=cuahsihisfcelterids0_graph_reports(start=report_msgraph)
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisfrcwqmids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisfrcwqmids0.py
deleted file mode 100644
index 5529ed68..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisfrcwqmids0.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils.util
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
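- # The minio client wants a bare server name: any *.amazonaws.com address is
- # collapsed to "s3.amazonaws.com"; anything else passes through unchanged.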
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
- # revision of EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
- ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
- # docker.py: for a replicated job, total completions = replicas;
- # with replicas=0 you do not get a container
- service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
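- # the ConfigReferences mount the gleaner/nabu swarm configs into the service
- # container at the config paths defined above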
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
- mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
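- # swarm schedules the job asynchronously; poll (up to ~12s) until the task's
- # container exists so its logs can be streamed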
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise f"Container for service {name} not starting"
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
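- # NetworkMode places the job on the headless network so the container can
- # reach the headless browser endpoint configured above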
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
- cid = container.id # legacy until the service start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
- ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs no matter what, so do not exit yet.
- ## or should the log collection move into finally?
- ### in which case it needs helpers that do not raise.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
- # Future: need to extract the files, then upload them
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def cuahsihisfrcwqmids0_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def cuahsihisfrcwqmids0_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "cuahsihisfrcwqmids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisfrcwqmids0_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "cuahsihisfrcwqmids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisfrcwqmids0_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "cuahsihisfrcwqmids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisfrcwqmids0_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "cuahsihisfrcwqmids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisfrcwqmids0_naburelease(context):
- returned_value = gleanerio(context,("release"), "cuahsihisfrcwqmids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisfrcwqmids0_uploadrelease(context):
- returned_value = postRelease("cuahsihisfrcwqmids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisfrcwqmids0_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisfrcwqmids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisfrcwqmids0"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisfrcwqmids0_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisfrcwqmids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisfrcwqmids0"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisfrcwqmids0_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisfrcwqmids0")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisfrcwqmids0"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisfrcwqmids0_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisfrcwqmids0")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisfrcwqmids0"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisfrcwqmids0_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisfrcwqmids0"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
-#Can we simplify and use just a method. Then import these methods?
-# def missingreport_s3(context, msg: str, source="cuahsihisfrcwqmids0"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="cuahsihisfrcwqmids0"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_cuahsihisfrcwqmids0():
- containers = cuahsihisfrcwqmids0_getImage()
- harvest = cuahsihisfrcwqmids0_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = cuahsihisfrcwqmids0_missingreport_s3(start=harvest)
- report_idstat = cuahsihisfrcwqmids0_identifier_stats(start=report_ms3)
- # for some reason, this caused a "msg parameter missing" error
- report_bucketurl = cuahsihisfrcwqmids0_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="cuahsihisfrcwqmids0")
- load_release = cuahsihisfrcwqmids0_naburelease(start=harvest)
- load_uploadrelease = cuahsihisfrcwqmids0_uploadrelease(start=load_release)
-
- load_prune = cuahsihisfrcwqmids0_nabu_prune(start=load_uploadrelease)
- load_prov = cuahsihisfrcwqmids0_nabuprov(start=load_prune)
- load_org = cuahsihisfrcwqmids0_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=cuahsihisfrcwqmids0_missingreport_graph(start=load_org)
- report_graph=cuahsihisfrcwqmids0_graph_reports(start=report_msgraph)
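- # two chains fan out from harvest: reports (missing s3 -> identifier stats ->
- # bucket urls) and the load (release -> upload -> prune -> prov -> orgs);
- # graph reports run only after the load completes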
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisghcnids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisghcnids0.py
deleted file mode 100644
index 8d042b85..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisghcnids0.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils.util
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
- # revision of EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
- ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
- # docker.py: for a replicated job, total completions = replicas;
- # with replicas=0 you do not get a container
- service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
- mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise f"Container for service {name} not starting"
-
- get_dagster_logger().info(len(containers))
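- # with replicas=1 the replicated-job yields exactly one container, so the
- # first match is the job's container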
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
- cid = container.id # legacy until the service start handling gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
- ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
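- # Sketch of the resulting URL (host hypothetical). Note urlencode
- # percent-escapes the path value, and WorkingDir's trailing slash
- # produces a double slash in the path:
- #   https://portainer.example/containers/<cid>/archive?path=%2Fnabu%2F%2Flogs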
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
- # Future: need to extract the files, and upload them
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def cuahsihisghcnids0_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def cuahsihisghcnids0_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "cuahsihisghcnids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisghcnids0_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "cuahsihisghcnids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisghcnids0_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "cuahsihisghcnids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisghcnids0_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "cuahsihisghcnids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisghcnids0_naburelease(context):
- returned_value = gleanerio(context,("release"), "cuahsihisghcnids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisghcnids0_uploadrelease(context):
- returned_value = postRelease("cuahsihisghcnids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisghcnids0_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisghcnids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisghcnids0"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisghcnids0_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisghcnids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisghcnids0"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisghcnids0_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisghcnids0")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisghcnids0"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisghcnids0_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisghcnids0")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisghcnids0"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisghcnids0_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisghcnids0"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
-#Can we simplify and use just a method. Then import these methods?
-# def missingreport_s3(context, msg: str, source="cuahsihisghcnids0"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="cuahsihisghcnids0"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_cuahsihisghcnids0():
- containers = cuahsihisghcnids0_getImage()
- harvest = cuahsihisghcnids0_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
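- # Minimal sketch of that pattern (op names hypothetical):
- #   @op(ins={"start": In(Nothing)})
- #   def second(context): ...
- #   second(start=first())  # orders second after first; no value is passed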
-
- report_ms3 = cuahsihisghcnids0_missingreport_s3(start=harvest)
- report_idstat = cuahsihisghcnids0_identifier_stats(start=report_ms3)
- # for some reason, this causes a msg parameter missing
- report_bucketurl = cuahsihisghcnids0_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="cuahsihisghcnids0")
- load_release = cuahsihisghcnids0_naburelease(start=harvest)
- load_uploadrelease = cuahsihisghcnids0_uploadrelease(start=load_release)
-
- load_prune = cuahsihisghcnids0_nabu_prune(start=load_uploadrelease)
- load_prov = cuahsihisghcnids0_nabuprov(start=load_prune)
- load_org = cuahsihisghcnids0_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=cuahsihisghcnids0_missingreport_graph(start=load_org)
- report_graph=cuahsihisghcnids0_graph_reports(start=report_msgraph)
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisglacialridgeids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisglacialridgeids0.py
deleted file mode 100644
index a5966474..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisglacialridgeids0.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils.util
-import time
-
-from dagster import job, op, graph, In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, and not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
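-# URL is also used as the docker SDK base_url below, so it presumably points
-# at the Portainer-proxied Docker API; a hypothetical example value:
-#   PORTAINER_URL=https://portainer.example:9443/api/endpoints/1/docker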
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
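-# e.g. GLEANERIO_GRAPH_URL=http://graph:9999/blazegraph and
-# GLEANERIO_GRAPH_NAMESPACE=earthcube (hypothetical values) yield
-# http://graph:9999/blazegraph/namespace/earthcube/sparql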
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
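-# quick sketch of the normalization (hostnames hypothetical):
-#   _pythonMinioUrl("s3.us-east-1.amazonaws.com") -> "s3.amazonaws.com"
-#   _pythonMinioUrl("minio.example.org")          -> "minio.example.org"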
-def read_file_bytestream(image_path):
- # use a context manager so the file handle is closed promptly
- with open(image_path, 'rb') as f:
- data = f.read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
- # revision of EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
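-# postRelease relies on Blazegraph being able to load a file the server
-# fetches itself: POST to the sparql endpoint with ?uri=<file url>. A minimal
-# manual equivalent (host and bucket hypothetical):
-#   requests.post(f"{_graphEndpoint()}?uri=https://minio.example/gleaner/graphs/latest/demo_release.nq")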
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
- ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
- # docker.py if replicated job, total completions = replicas
- # replicas =0 you do not get a container
- service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
- mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise f"Container for service {name} not starting"
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
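-# usage sketch, mirroring the call in gleanerio below (values hypothetical):
-#   service, container = _create_service(
-#       context, client, container_context, GLEANERIO_NABU_IMAGE, "",
-#       ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/demo"],
-#       name="sch_demo_release", workingdir="/nabu/")
-# replicated-job mode with replicas=1 runs the task once to completion, so
-# the single swarm task's container is what the loop above polls for.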
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
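- # mode -> args handed to the image's entrypoint (the gleaner or nabu binary),
- # summarizing the branches below; config paths come from the envs above:
- #   gleaner : --cfg <gleaner cfg> -source <source> --rude
- #   prune   : --cfg <nabu cfg> prune  --prefix summoned/<source>
- #   prov    : --cfg <nabu cfg> prefix --prefix prov/<source>
- #   orgs    : --cfg <nabu cfg> prefix --prefix orgs
- #   release : --cfg <nabu cfg> release --prefix summoned/<source>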
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
- cid = container.id # legacy until the service start handling gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
- ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
- # Future: need to extract the files, and upload them
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def cuahsihisglacialridgeids0_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def cuahsihisglacialridgeids0_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "cuahsihisglacialridgeids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisglacialridgeids0_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "cuahsihisglacialridgeids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisglacialridgeids0_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "cuahsihisglacialridgeids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisglacialridgeids0_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "cuahsihisglacialridgeids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisglacialridgeids0_naburelease(context):
- returned_value = gleanerio(context,("release"), "cuahsihisglacialridgeids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisglacialridgeids0_uploadrelease(context):
- returned_value = postRelease("cuahsihisglacialridgeids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisglacialridgeids0_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisglacialridgeids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisglacialridgeids0"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisglacialridgeids0_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisglacialridgeids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisglacialridgeids0"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisglacialridgeids0_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisglacialridgeids0")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisglacialridgeids0"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisglacialridgeids0_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisglacialridgeids0")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisglacialridgeids0"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisglacialridgeids0_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisglacialridgeids0"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
-#Can we simplify and use just a method. Then import these methods?
-# def missingreport_s3(context, msg: str, source="cuahsihisglacialridgeids0"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="cuahsihisglacialridgeids0"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_cuahsihisglacialridgeids0():
- containers = cuahsihisglacialridgeids0_getImage()
- harvest = cuahsihisglacialridgeids0_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = cuahsihisglacialridgeids0_missingreport_s3(start=harvest)
- report_idstat = cuahsihisglacialridgeids0_identifier_stats(start=report_ms3)
- # for some reason, this causes a msg parameter missing
- report_bucketurl = cuahsihisglacialridgeids0_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="cuahsihisglacialridgeids0")
- load_release = cuahsihisglacialridgeids0_naburelease(start=harvest)
- load_uploadrelease = cuahsihisglacialridgeids0_uploadrelease(start=load_release)
-
- load_prune = cuahsihisglacialridgeids0_nabu_prune(start=load_uploadrelease)
- load_prov = cuahsihisglacialridgeids0_nabuprov(start=load_prune)
- load_org = cuahsihisglacialridgeids0_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=cuahsihisglacialridgeids0_missingreport_graph(start=load_org)
- report_graph=cuahsihisglacialridgeids0_graph_reports(start=report_msgraph)
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisgleonauburnids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisgleonauburnids0.py
deleted file mode 100644
index 418482c3..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisgleonauburnids0.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils.util
-import time
-
-from dagster import job, op, graph, In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, and not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- # use a context manager so the file handle is closed promptly
- with open(image_path, 'rb') as f:
- data = f.read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
- # revision of EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
- ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
- # docker.py if replicated job, total completions = replicas
- # replicas =0 you do not get a container
- service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
- mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise f"Container for service {name} not starting"
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
- cid = container.id # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
- ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
- # Future, need to extract files, and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def cuahsihisgleonauburnids0_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def cuahsihisgleonauburnids0_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "cuahsihisgleonauburnids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisgleonauburnids0_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "cuahsihisgleonauburnids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisgleonauburnids0_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "cuahsihisgleonauburnids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisgleonauburnids0_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "cuahsihisgleonauburnids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisgleonauburnids0_naburelease(context):
- returned_value = gleanerio(context,("release"), "cuahsihisgleonauburnids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisgleonauburnids0_uploadrelease(context):
- returned_value = postRelease("cuahsihisgleonauburnids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisgleonauburnids0_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisgleonauburnids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisgleonauburnids0"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisgleonauburnids0_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisgleonauburnids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisgleonauburnids0"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisgleonauburnids0_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisgleonauburnids0")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisgleonauburnids0"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisgleonauburnids0_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisgleonauburnids0")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisgleonauburnids0"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisgleonauburnids0_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisgleonauburnids0"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
-#Can we simplify and use just a method. Then import these methods?
-# def missingreport_s3(context, msg: str, source="cuahsihisgleonauburnids0"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="cuahsihisgleonauburnids0"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_cuahsihisgleonauburnids0():
- containers = cuahsihisgleonauburnids0_getImage()
- harvest = cuahsihisgleonauburnids0_gleaner(start=containers)
-
-# defining nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = cuahsihisgleonauburnids0_missingreport_s3(start=harvest)
- report_idstat = cuahsihisgleonauburnids0_identifier_stats(start=report_ms3)
- # for some reason, this caused a "msg parameter missing" error
- report_bucketurl = cuahsihisgleonauburnids0_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="cuahsihisgleonauburnids0")
- load_release = cuahsihisgleonauburnids0_naburelease(start=harvest)
- load_uploadrelease = cuahsihisgleonauburnids0_uploadrelease(start=load_release)
-
- load_prune = cuahsihisgleonauburnids0_nabu_prune(start=load_uploadrelease)
- load_prov = cuahsihisgleonauburnids0_nabuprov(start=load_prune)
- load_org = cuahsihisgleonauburnids0_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=cuahsihisgleonauburnids0_missingreport_graph(start=load_org)
- report_graph=cuahsihisgleonauburnids0_graph_reports(start=report_msgraph)
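- # A graph like this is typically wrapped into a schedulable job elsewhere;
- # a minimal sketch (name hypothetical, not part of this file):
- #   harvest_job = harvest_cuahsihisgleonauburnids0.to_job()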
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisgleondorsetids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisgleondorsetids0.py
deleted file mode 100644
index 88ce2d7b..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisgleondorsetids0.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils.util
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, and not the names of the object in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
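-# All GLEANERIO_* settings come from the deployment environment (e.g. a
-# compose .env file); note that str(os.environ.get(...)) turns a missing,
-# defaultless value into the literal string "None" rather than failing fast.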
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
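-# _graphEndpoint() example (hypothetical values): GLEANERIO_GRAPH_URL set to
-# http://triplestore:9999/blazegraph and GLEANERIO_GRAPH_NAMESPACE set to iow
-# yield http://triplestore:9999/blazegraph/namespace/iow/sparql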
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
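-# _pythonMinioUrl collapses any *.amazonaws.com host to the generic
-# s3.amazonaws.com endpoint the Minio client expects; other hosts pass through.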
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
- # revision of EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
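- # docker-py does not appear to expose a public hook for extra HTTP headers,
- # so the Portainer API key is pushed into private fields of the APIClient.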
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
- ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
- # docker-py: for a replicated job, total completions = replicas;
- # with replicas=0 you do not get a container at all
- service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
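- # "replicated-job" runs the task to completion exactly once (no restart,
- # per the RestartPolicy above), which is what a batch harvest run needs.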
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env=env_vars,
- name=name,
- networks=container_context.networks if len(container_context.networks) else None,
- restart_policy=restart_policy,
- mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count = 0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise Exception(f"Container for service {name} not starting")
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
- cid = container.id # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
- ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
- # Future, need to extract files, and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def cuahsihisgleondorsetids0_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def cuahsihisgleondorsetids0_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "cuahsihisgleondorsetids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisgleondorsetids0_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "cuahsihisgleondorsetids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisgleondorsetids0_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "cuahsihisgleondorsetids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisgleondorsetids0_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "cuahsihisgleondorsetids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisgleondorsetids0_naburelease(context):
- returned_value = gleanerio(context,("release"), "cuahsihisgleondorsetids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisgleondorsetids0_uploadrelease(context):
- returned_value = postRelease("cuahsihisgleondorsetids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisgleondorsetids0_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisgleondorsetids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisgleondorsetids0"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisgleondorsetids0_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisgleondorsetids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisgleondorsetids0"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisgleondorsetids0_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisgleondorsetids0")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisgleondorsetids0"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisgleondorsetids0_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisgleondorsetids0")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisgleondorsetids0"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisgleondorsetids0_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisgleondorsetids0"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
-#Can we simplify and use just a method. Then import these methods?
-# def missingreport_s3(context, msg: str, source="cuahsihisgleondorsetids0"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="cuahsihisgleondorsetids0"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_cuahsihisgleondorsetids0():
- containers = cuahsihisgleondorsetids0_getImage()
- harvest = cuahsihisgleondorsetids0_gleaner(start=containers)
-
-# defining nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = cuahsihisgleondorsetids0_missingreport_s3(start=harvest)
- report_idstat = cuahsihisgleondorsetids0_identifier_stats(start=report_ms3)
- # for some reason, this caused a "msg parameter missing" error
- report_bucketurl = cuahsihisgleondorsetids0_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="cuahsihisgleondorsetids0")
- load_release = cuahsihisgleondorsetids0_naburelease(start=harvest)
- load_uploadrelease = cuahsihisgleondorsetids0_uploadrelease(start=load_release)
-
- load_prune = cuahsihisgleondorsetids0_nabu_prune(start=load_uploadrelease)
- load_prov = cuahsihisgleondorsetids0_nabuprov(start=load_prune)
- load_org = cuahsihisgleondorsetids0_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=cuahsihisgleondorsetids0_missingreport_graph(start=load_org)
- report_graph=cuahsihisgleondorsetids0_graph_reports(start=report_msgraph)
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisgleonlakeannieids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisgleonlakeannieids0.py
deleted file mode 100644
index 13ec911c..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisgleonlakeannieids0.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils.util
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, and not the names of the object in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
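- # urlopen raises ValueError for strings that are not URLs, so anything
- # that fails to parse as a URL is treated as a local file path instead.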
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
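- # Pass the bare host when the scheme's default port is in use (80 for plain
- # HTTP, 443 for TLS); otherwise the Minio client needs an explicit host:port.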
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
- # revision of EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
- ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
- # docker-py: for a replicated job, total completions = replicas;
- # with replicas=0 you do not get a container at all
- service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env=env_vars,
- name=name,
- networks=container_context.networks if len(container_context.networks) else None,
- restart_policy=restart_policy,
- mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count = 0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise Exception(f"Container for service {name} not starting")
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
- cid = container.id # legacy until the service start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
- ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
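-# pull {WorkingDir}/logs out of the container via the Docker Engine
-# /containers/{id}/archive endpoint (a tar stream), called through the
-# Portainer proxy; container.get_archive() returns nothing here (see below)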
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
- # Future: need to extract the files, and upload them
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def cuahsihisgleonlakeannieids0_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def cuahsihisgleonlakeannieids0_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "cuahsihisgleonlakeannieids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisgleonlakeannieids0_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "cuahsihisgleonlakeannieids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisgleonlakeannieids0_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "cuahsihisgleonlakeannieids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisgleonlakeannieids0_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "cuahsihisgleonlakeannieids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisgleonlakeannieids0_naburelease(context):
- returned_value = gleanerio(context,("release"), "cuahsihisgleonlakeannieids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisgleonlakeannieids0_uploadrelease(context):
- returned_value = postRelease("cuahsihisgleonlakeannieids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisgleonlakeannieids0_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisgleonlakeannieids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisgleonlakeannieids0"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisgleonlakeannieids0_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisgleonlakeannieids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisgleonlakeannieids0"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # graph (milled) check only; the summon check is off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisgleonlakeannieids0_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisgleonlakeannieids0")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisgleonlakeannieids0"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisgleonlakeannieids0_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisgleonlakeannieids0")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisgleonlakeannieids0"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisgleonlakeannieids0_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisgleonlakeannieids0"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
-#Can we simplify and use just a method. Then import these methods?
-# def missingreport_s3(context, msg: str, source="cuahsihisgleonlakeannieids0"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="cuahsihisgleonlakeannieids0"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_cuahsihisgleonlakeannieids0():
- containers = cuahsihisgleonlakeannieids0_getImage()
- harvest = cuahsihisgleonlakeannieids0_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = cuahsihisgleonlakeannieids0_missingreport_s3(start=harvest)
- report_idstat = cuahsihisgleonlakeannieids0_identifier_stats(start=report_ms3)
- # for some reason, this causes a msg parameter missing
- report_bucketurl = cuahsihisgleonlakeannieids0_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="cuahsihisgleonlakeannieids0")
- load_release = cuahsihisgleonlakeannieids0_naburelease(start=harvest)
- load_uploadrelease = cuahsihisgleonlakeannieids0_uploadrelease(start=load_release)
-
- load_prune = cuahsihisgleonlakeannieids0_nabu_prune(start=load_uploadrelease)
- load_prov = cuahsihisgleonlakeannieids0_nabuprov(start=load_prune)
- load_org = cuahsihisgleonlakeannieids0_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=cuahsihisgleonlakeannieids0_missingreport_graph(start=load_org)
- report_graph=cuahsihisgleonlakeannieids0_graph_reports(start=report_msgraph)
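-
-# overall order: pull images -> gleaner harvest, then two branches off the
-# harvest: (missing report -> identifier stats -> bucket urls) and
-# (release -> upload -> prune -> prov -> orgs); graph reports run last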
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisgleonsunapeeids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisgleonsunapeeids0.py
deleted file mode 100644
index 1560b6da..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisgleonsunapeeids0.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, and not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
- # revision of EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
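- # the Portainer API key (PORTAINER_KEY) rides along as an X-API-Key
- # header on every call, since base_url points at Portainer's Docker API
- # proxy rather than a local socket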
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
- ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
- # docker.py if replicated job, total completions = replicas
- # replicas =0 you do not get a container
- service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
- mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
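- # the service's container is created asynchronously by swarm; poll for
- # it (roughly 12s) before giving up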
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise f"Container for service {name} not starting"
-
- get_dagster_logger().info(f"found {len(containers)} containers for service {name}")
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
- cid = container.id # legacy until the service start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
- ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
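-# fetch the container's {WorkingDir}/logs directory as a tar stream from
-# the Engine archive endpoint via the Portainer proxy; get_archive() is
-# unreliable here (see the commented block below)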
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
- # Future: need to extract the files, and upload them
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def cuahsihisgleonsunapeeids0_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def cuahsihisgleonsunapeeids0_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "cuahsihisgleonsunapeeids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisgleonsunapeeids0_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "cuahsihisgleonsunapeeids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisgleonsunapeeids0_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "cuahsihisgleonsunapeeids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisgleonsunapeeids0_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "cuahsihisgleonsunapeeids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisgleonsunapeeids0_naburelease(context):
- returned_value = gleanerio(context,("release"), "cuahsihisgleonsunapeeids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisgleonsunapeeids0_uploadrelease(context):
- returned_value = postRelease("cuahsihisgleonsunapeeids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisgleonsunapeeids0_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisgleonsunapeeids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisgleonsunapeeids0"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisgleonsunapeeids0_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisgleonsunapeeids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisgleonsunapeeids0"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # graph (milled) check only; the summon check is off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisgleonsunapeeids0_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisgleonsunapeeids0")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisgleonsunapeeids0"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisgleonsunapeeids0_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisgleonsunapeeids0")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisgleonsunapeeids0"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisgleonsunapeeids0_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisgleonsunapeeids0"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
-#Can we simplify and use just a method. Then import these methods?
-# def missingreport_s3(context, msg: str, source="cuahsihisgleonsunapeeids0"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="cuahsihisgleonsunapeeids0"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_cuahsihisgleonsunapeeids0():
- containers = cuahsihisgleonsunapeeids0_getImage()
- harvest = cuahsihisgleonsunapeeids0_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = cuahsihisgleonsunapeeids0_missingreport_s3(start=harvest)
- report_idstat = cuahsihisgleonsunapeeids0_identifier_stats(start=report_ms3)
- # for some reason, this causes a msg parameter missing
- report_bucketurl = cuahsihisgleonsunapeeids0_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="cuahsihisgleonsunapeeids0")
- load_release = cuahsihisgleonsunapeeids0_naburelease(start=harvest)
- load_uploadrelease = cuahsihisgleonsunapeeids0_uploadrelease(start=load_release)
-
- load_prune = cuahsihisgleonsunapeeids0_nabu_prune(start=load_uploadrelease)
- load_prov = cuahsihisgleonsunapeeids0_nabuprov(start=load_prune)
- load_org = cuahsihisgleonsunapeeids0_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=cuahsihisgleonsunapeeids0_missingreport_graph(start=load_org)
- report_graph=cuahsihisgleonsunapeeids0_graph_reports(start=report_msgraph)
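-
-# net effect: image pull -> harvest, then reporting (missing report ->
-# identifier stats -> bucket urls) in parallel with loading (release ->
-# upload -> prune -> prov -> orgs), with the graph reports at the end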
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisglobalriversobservatoryids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisglobalriversobservatoryids0.py
deleted file mode 100644
index f4654481..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisglobalriversobservatoryids0.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, and not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
- # revision of EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
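- # every request carries the Portainer X-API-Key header; the client is
- # effectively driving Docker through Portainer's API proxy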
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
- ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
- # docker.py if replicated job, total completions = replicas
- # replicas =0 you do not get a container
- service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
- mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
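- # Wait for Swarm to schedule a container for this one-shot job; the task list can
- # lag, so match containers on the com.docker.swarm.service.name label and give up after ~12s.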
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise f"Container for service {name} not starting"
-
- get_dagster_logger().info(f"found {len(containers)} container(s) for service {name}")
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
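- # mode selects the container command: "gleaner" harvests a source, while
- # "prune", "prov", "orgs" and "release" run the corresponding nabu graph-maintenance actions.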
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
- cid = container.id # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
- ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then throw an error if the exit code was non-zero
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
- # Future: need to extract files and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
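- # A possible sketch for that future step (untested; `archive_bytes` stands in
- # for the tar payload read from the archive endpoint above):
- # import tarfile
- # with tarfile.open(fileobj=io.BytesIO(archive_bytes)) as tar:
- #     for member in tar.getmembers():
- #         fobj = tar.extractfile(member)
- #         if fobj:  # extractfile returns None for directories
- #             s3loader(fobj.read(), f"{source}_{member.name}_runlogs")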
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def cuahsihisglobalriversobservatoryids0_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def cuahsihisglobalriversobservatoryids0_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "cuahsihisglobalriversobservatoryids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisglobalriversobservatoryids0_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "cuahsihisglobalriversobservatoryids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisglobalriversobservatoryids0_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "cuahsihisglobalriversobservatoryids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisglobalriversobservatoryids0_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "cuahsihisglobalriversobservatoryids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisglobalriversobservatoryids0_naburelease(context):
- returned_value = gleanerio(context,("release"), "cuahsihisglobalriversobservatoryids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisglobalriversobservatoryids0_uploadrelease(context):
- returned_value = postRelease("cuahsihisglobalriversobservatoryids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisglobalriversobservatoryids0_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisglobalriversobservatoryids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisglobalriversobservatoryids0"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisglobalriversobservatoryids0_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisglobalriversobservatoryids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisglobalriversobservatoryids0"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisglobalriversobservatoryids0_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisglobalriversobservatoryids0")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisglobalriversobservatoryids0"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisglobalriversobservatoryids0_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisglobalriversobservatoryids0")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisglobalriversobservatoryids0"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisglobalriversobservatoryids0_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisglobalriversobservatoryids0"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
-# Can we simplify and use just a method, then import these methods?
-# def missingreport_s3(context, msg: str, source="cuahsihisglobalriversobservatoryids0"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="cuahsihisglobalriversobservatoryids0"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_cuahsihisglobalriversobservatoryids0():
- containers = cuahsihisglobalriversobservatoryids0_getImage()
- harvest = cuahsihisglobalriversobservatoryids0_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = cuahsihisglobalriversobservatoryids0_missingreport_s3(start=harvest)
- report_idstat = cuahsihisglobalriversobservatoryids0_identifier_stats(start=report_ms3)
- # for some reason, this causes a "msg parameter missing" error
- report_bucketurl = cuahsihisglobalriversobservatoryids0_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="cuahsihisglobalriversobservatoryids0")
- load_release = cuahsihisglobalriversobservatoryids0_naburelease(start=harvest)
- load_uploadrelease = cuahsihisglobalriversobservatoryids0_uploadrelease(start=load_release)
-
- load_prune = cuahsihisglobalriversobservatoryids0_nabu_prune(start=load_uploadrelease)
- load_prov = cuahsihisglobalriversobservatoryids0_nabuprov(start=load_prune)
- load_org = cuahsihisglobalriversobservatoryids0_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=cuahsihisglobalriversobservatoryids0_missingreport_graph(start=load_org)
- report_graph=cuahsihisglobalriversobservatoryids0_graph_reports(start=report_msgraph)
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisgonggaids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisgonggaids0.py
deleted file mode 100644
index 1b78b0e7..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisgonggaids0.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils.util
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- with open(image_path, 'rb') as f:
- data = f.read()
- return data
-
-
-def load_data(file_or_url):
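- # accepts either a URL or a local file path: urlopen raises ValueError for
- # strings that are not URLs, which triggers the local-file fallback below.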
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
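- # the MinIO client wants host[:port] with no scheme; drop standard ports
- # (80 plain / 443 TLS) so the client defaults apply.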
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
- # revision of EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # Blazegraph's insert response looks like <data modified="42" milliseconds="10"/>; modified="0" means nothing was loaded
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
- ## thoughts
- # return service and container, since there is one
- restart_policy = RestartPolicy(condition='none')
- # docker-py: for a replicated-job service, total completions = replicas
- # with replicas=0 you never get a container
- service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
- mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise f"Container for service {name} not starting"
-
- get_dagster_logger().info(f"found {len(containers)} container(s) for service {name}")
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
- cid = container.id # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
- ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then throw an error if the exit code was non-zero
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
- # Future: need to extract files and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def cuahsihisgonggaids0_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def cuahsihisgonggaids0_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "cuahsihisgonggaids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisgonggaids0_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "cuahsihisgonggaids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisgonggaids0_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "cuahsihisgonggaids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisgonggaids0_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "cuahsihisgonggaids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisgonggaids0_naburelease(context):
- returned_value = gleanerio(context,("release"), "cuahsihisgonggaids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisgonggaids0_uploadrelease(context):
- returned_value = postRelease("cuahsihisgonggaids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisgonggaids0_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisgonggaids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisgonggaids0"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisgonggaids0_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisgonggaids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisgonggaids0"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisgonggaids0_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisgonggaids0")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisgonggaids0"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisgonggaids0_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisgonggaids0")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisgonggaids0"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisgonggaids0_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisgonggaids0"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
-# Can we simplify and use just a method, then import these methods?
-# def missingreport_s3(context, msg: str, source="cuahsihisgonggaids0"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="cuahsihisgonggaids0"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_cuahsihisgonggaids0():
- containers = cuahsihisgonggaids0_getImage()
- harvest = cuahsihisgonggaids0_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = cuahsihisgonggaids0_missingreport_s3(start=harvest)
- report_idstat = cuahsihisgonggaids0_identifier_stats(start=report_ms3)
- # for some reason, this causes a "msg parameter missing" error
- report_bucketurl = cuahsihisgonggaids0_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="cuahsihisgonggaids0")
- load_release = cuahsihisgonggaids0_naburelease(start=harvest)
- load_uploadrelease = cuahsihisgonggaids0_uploadrelease(start=load_release)
-
- load_prune = cuahsihisgonggaids0_nabu_prune(start=load_uploadrelease)
- load_prov = cuahsihisgonggaids0_nabuprov(start=load_prune)
- load_org = cuahsihisgonggaids0_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=cuahsihisgonggaids0_missingreport_graph(start=load_org)
- report_graph=cuahsihisgonggaids0_graph_reports(start=report_msgraph)
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihishassbergeids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihishassbergeids0.py
deleted file mode 100644
index 5ac06488..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihishassbergeids0.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils.util
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- with open(image_path, 'rb') as f:
- data = f.read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
- # revision of EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
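-# Sketch of the release URL postRelease builds (host and bucket are
-# hypothetical): https://oss.example.org:443/gleaner/graphs/latest/mysource_release.nq
-# which it then POSTs as {_graphEndpoint()}?uri=<release_url> so the graph
-# store loads the nquads file directly from the object store.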
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
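-# Note: the Portainer API key is attached both to docker-py's general configs
-# and to the low-level session headers, so every Docker call routed through
-# the PORTAINER_URL endpoint carries X-API-Key.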
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
- ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
- # docker.py: for a replicated job, total completions = replicas
- # with replicas=0 you do not get a container
- service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
- mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise f"Container for service {name} not starting"
-
- get_dagster_logger().info(f"found {len(containers)} container(s) for service {name}")
- return service, containers[0]
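-# Timing sketch: the loop above polls once per second for the swarm task's
-# container, so a service whose container has not appeared after ~12 polls
-# raises rather than blocking the run indefinitely.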
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add env variables here, e.g. "Env": ["FOO=bar","BAZ=quux"]
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
- cid = container.id # legacy until the service start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
- ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
- # Future: need to extract the files, then upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
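-# Call sketch (source name is illustrative): gleanerio runs one container per
-# mode and returns 0 on success, raising if the container exits non-zero:
-#   gleanerio(context, "gleaner", "mysource")  # harvest a source
-#   gleanerio(context, "release", "mysource")  # build the nq release file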
-
-@op
-def cuahsihishassbergeids0_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def cuahsihishassbergeids0_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "cuahsihishassbergeids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihishassbergeids0_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "cuahsihishassbergeids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihishassbergeids0_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "cuahsihishassbergeids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihishassbergeids0_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "cuahsihishassbergeids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihishassbergeids0_naburelease(context):
- returned_value = gleanerio(context,("release"), "cuahsihishassbergeids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihishassbergeids0_uploadrelease(context):
- returned_value = postRelease("cuahsihishassbergeids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def cuahsihishassbergeids0_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihishassbergeids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihishassbergeids0"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihishassbergeids0_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihishassbergeids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihishassbergeids0"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihishassbergeids0_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihishassbergeids0")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihishassbergeids0"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihishassbergeids0_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihishassbergeids0")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihishassbergeids0"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihishassbergeids0_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihishassbergeids0"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
-# Can we simplify and use just a method, then import these methods?
-# def missingreport_s3(context, msg: str, source="cuahsihishassbergeids0"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="cuahsihishassbergeids0"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_cuahsihishassbergeids0():
- containers = cuahsihishassbergeids0_getImage()
- harvest = cuahsihishassbergeids0_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = cuahsihishassbergeids0_missingreport_s3(start=harvest)
- report_idstat = cuahsihishassbergeids0_identifier_stats(start=report_ms3)
- # for some reason, this causes a "msg parameter missing" error
- report_bucketurl = cuahsihishassbergeids0_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="cuahsihishassbergeids0")
- load_release = cuahsihishassbergeids0_naburelease(start=harvest)
- load_uploadrelease = cuahsihishassbergeids0_uploadrelease(start=load_release)
-
- load_prune = cuahsihishassbergeids0_nabu_prune(start=load_uploadrelease)
- load_prov = cuahsihishassbergeids0_nabuprov(start=load_prune)
- load_org = cuahsihishassbergeids0_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=cuahsihishassbergeids0_missingreport_graph(start=load_org)
- report_graph=cuahsihishassbergeids0_graph_reports(start=report_msgraph)
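- # The start=... kwargs carry no data; they are Nothing dependencies that only
- # impose ordering: getImage -> gleaner, then the s3/identifier/bucket reports,
- # and release -> upload -> prune -> prov -> orgs -> graph reports.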
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihishydrodataczdids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihishydrodataczdids0.py
deleted file mode 100644
index a3c669b4..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihishydrodataczdids0.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, and not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
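-# e.g. with hypothetical settings GLEANERIO_GRAPH_URL=http://graph:9999/blazegraph
-# and GLEANERIO_GRAPH_NAMESPACE=earthcube this resolves to:
-#   http://graph:9999/blazegraph/namespace/earthcube/sparql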
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- with open(image_path, 'rb') as f:
- data = f.read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
- # revision of EC utilities, will have a insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
- ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
- # docker.py: for a replicated job, total completions = replicas
- # with replicas=0 you do not get a container
- service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
- mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise f"Container for service {name} not starting"
-
- get_dagster_logger().info(f"found {len(containers)} container(s) for service {name}")
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add env variables here, e.g. "Env": ["FOO=bar","BAZ=quux"]
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
- cid = container.id # legacy until the service start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
- ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
- # Future: need to extract the files, then upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def cuahsihishydrodataczdids0_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def cuahsihishydrodataczdids0_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "cuahsihishydrodataczdids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihishydrodataczdids0_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "cuahsihishydrodataczdids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihishydrodataczdids0_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "cuahsihishydrodataczdids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihishydrodataczdids0_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "cuahsihishydrodataczdids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihishydrodataczdids0_naburelease(context):
- returned_value = gleanerio(context,("release"), "cuahsihishydrodataczdids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihishydrodataczdids0_uploadrelease(context):
- returned_value = postRelease("cuahsihishydrodataczdids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def cuahsihishydrodataczdids0_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihishydrodataczdids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihishydrodataczdids0"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihishydrodataczdids0_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihishydrodataczdids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihishydrodataczdids0"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihishydrodataczdids0_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihishydrodataczdids0")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihishydrodataczdids0"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihishydrodataczdids0_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihishydrodataczdids0")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihishydrodataczdids0"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihishydrodataczdids0_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihishydrodataczdids0"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
-# Can we simplify and use just a method, then import these methods?
-# def missingreport_s3(context, msg: str, source="cuahsihishydrodataczdids0"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="cuahsihishydrodataczdids0"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_cuahsihishydrodataczdids0():
- containers = cuahsihishydrodataczdids0_getImage()
- harvest = cuahsihishydrodataczdids0_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = cuahsihishydrodataczdids0_missingreport_s3(start=harvest)
- report_idstat = cuahsihishydrodataczdids0_identifier_stats(start=report_ms3)
- # for some reason, this causes a "msg parameter missing" error
- report_bucketurl = cuahsihishydrodataczdids0_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="cuahsihishydrodataczdids0")
- load_release = cuahsihishydrodataczdids0_naburelease(start=harvest)
- load_uploadrelease = cuahsihishydrodataczdids0_uploadrelease(start=load_release)
-
- load_prune = cuahsihishydrodataczdids0_nabu_prune(start=load_uploadrelease)
- load_prov = cuahsihishydrodataczdids0_nabuprov(start=load_prune)
- load_org = cuahsihishydrodataczdids0_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=cuahsihishydrodataczdids0_missingreport_graph(start=load_org)
- report_graph=cuahsihishydrodataczdids0_graph_reports(start=report_msgraph)
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihishydrodataczhrids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihishydrodataczhrids0.py
deleted file mode 100644
index 0b18da56..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihishydrodataczhrids0.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, and not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- with open(image_path, 'rb') as f:
- data = f.read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
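- # with the default GLEANERIO_LOG_PREFIX this yields object keys like
- # scheduler/logs/<name>_YYYY_MM_DD_HH_MM_SS.log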
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
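- # assembles a Blazegraph bulk-load request against the release graph, roughly:
- # POST <graph>/namespace/<ns>/sparql?uri=http://<minio>:<port>/<bucket>/graphs/latest/<source>_release.nq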
- # revision of EC utilities, will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
- ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
- # docker.py if replicated job, total completions = replicas
- # replicas =0 you do not get a container
- service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
- mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
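- # poll up to ~12s for the replicated-job task's container to appear;
- # the swarm service creates it asynchronously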
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise f"Container for service {name} not starting"
-
- get_dagster_logger().info(f"found {len(containers)} container(s) for service {name}")
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
- cid = container.id # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
- ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
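- # pull {WorkingDir}/logs out of the container as a tar stream via the
- # Docker Engine API (GET /containers/{id}/archive), proxied through Portainer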
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
- # Future: need to extract files, and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def cuahsihishydrodataczhrids0_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def cuahsihishydrodataczhrids0_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "cuahsihishydrodataczhrids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihishydrodataczhrids0_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "cuahsihishydrodataczhrids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihishydrodataczhrids0_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "cuahsihishydrodataczhrids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihishydrodataczhrids0_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "cuahsihishydrodataczhrids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihishydrodataczhrids0_naburelease(context):
- returned_value = gleanerio(context,("release"), "cuahsihishydrodataczhrids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihishydrodataczhrids0_uploadrelease(context):
- returned_value = postRelease("cuahsihishydrodataczhrids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def cuahsihishydrodataczhrids0_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihishydrodataczhrids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihishydrodataczhrids0"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihishydrodataczhrids0_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihishydrodataczhrids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihishydrodataczhrids0"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihishydrodataczhrids0_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihishydrodataczhrids0")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihishydrodataczhrids0"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihishydrodataczhrids0_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihishydrodataczhrids0")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihishydrodataczhrids0"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihishydrodataczhrids0_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihishydrodataczhrids0"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
-# Can we simplify and use just a method, then import these methods?
-# def missingreport_s3(context, msg: str, source="cuahsihishydrodataczhrids0"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="cuahsihishydrodataczhrids0"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_cuahsihishydrodataczhrids0():
- containers = cuahsihishydrodataczhrids0_getImage()
- harvest = cuahsihishydrodataczhrids0_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = cuahsihishydrodataczhrids0_missingreport_s3(start=harvest)
- report_idstat = cuahsihishydrodataczhrids0_identifier_stats(start=report_ms3)
- # for some reason, this causes a "msg parameter missing" error
- report_bucketurl = cuahsihishydrodataczhrids0_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="cuahsihishydrodataczhrids0")
- load_release = cuahsihishydrodataczhrids0_naburelease(start=harvest)
- load_uploadrelease = cuahsihishydrodataczhrids0_uploadrelease(start=load_release)
-
- load_prune = cuahsihishydrodataczhrids0_nabu_prune(start=load_uploadrelease)
- load_prov = cuahsihishydrodataczhrids0_nabuprov(start=load_prune)
- load_org = cuahsihishydrodataczhrids0_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=cuahsihishydrodataczhrids0_missingreport_graph(start=load_org)
- report_graph=cuahsihishydrodataczhrids0_graph_reports(start=report_msgraph)
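- # note: a graph is not runnable on its own; the generated jobs/schedules
- # modules are assumed to wrap it, e.g. harvest_cuahsihishydrodataczhrids0.to_job()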
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisieeratwilkesuniversityids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisieeratwilkesuniversityids0.py
deleted file mode 100644
index fbbf3b93..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisieeratwilkesuniversityids0.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils.util
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, and not the names of the object in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET = str(os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH = str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH = str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE = str(os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/'))  # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG = str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG = str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- with open(image_path, 'rb') as f:
- return f.read()
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
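- # with the default GLEANERIO_LOG_PREFIX this yields object keys like
- # scheduler/logs/<name>_YYYY_MM_DD_HH_MM_SS.log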
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
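- # assembles a Blazegraph bulk-load request against the release graph, roughly:
- # POST <graph>/namespace/<ns>/sparql?uri=http://<minio>:<port>/<bucket>/graphs/latest/<source>_release.nq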
- # revision of EC utilities, will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
- ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
- # docker.py if replicated job, total completions = replicas
- # replicas =0 you do not get a container
- service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
- mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
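- # poll up to ~12s for the replicated-job task's container to appear;
- # the swarm service creates it asynchronously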
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise f"Container for service {name} not starting"
-
- get_dagster_logger().info(f"found {len(containers)} container(s) for service {name}")
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
- cid = container.id # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
- ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
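- # pull {WorkingDir}/logs out of the container as a tar stream via the
- # Docker Engine API (GET /containers/{id}/archive), proxied through Portainer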
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
- # Future: need to extract files, and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def cuahsihisieeratwilkesuniversityids0_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def cuahsihisieeratwilkesuniversityids0_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "cuahsihisieeratwilkesuniversityids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisieeratwilkesuniversityids0_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "cuahsihisieeratwilkesuniversityids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisieeratwilkesuniversityids0_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "cuahsihisieeratwilkesuniversityids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisieeratwilkesuniversityids0_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "cuahsihisieeratwilkesuniversityids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisieeratwilkesuniversityids0_naburelease(context):
- returned_value = gleanerio(context,("release"), "cuahsihisieeratwilkesuniversityids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisieeratwilkesuniversityids0_uploadrelease(context):
- returned_value = postRelease("cuahsihisieeratwilkesuniversityids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisieeratwilkesuniversityids0_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisieeratwilkesuniversityids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisieeratwilkesuniversityids0"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisieeratwilkesuniversityids0_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisieeratwilkesuniversityids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisieeratwilkesuniversityids0"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisieeratwilkesuniversityids0_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisieeratwilkesuniversityids0")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisieeratwilkesuniversityids0"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisieeratwilkesuniversityids0_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisieeratwilkesuniversityids0")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisieeratwilkesuniversityids0"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisieeratwilkesuniversityids0_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisieeratwilkesuniversityids0"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
-# Can we simplify and use just a method, then import these methods?
-# def missingreport_s3(context, msg: str, source="cuahsihisieeratwilkesuniversityids0"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="cuahsihisieeratwilkesuniversityids0"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_cuahsihisieeratwilkesuniversityids0():
- containers = cuahsihisieeratwilkesuniversityids0_getImage()
- harvest = cuahsihisieeratwilkesuniversityids0_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = cuahsihisieeratwilkesuniversityids0_missingreport_s3(start=harvest)
- report_idstat = cuahsihisieeratwilkesuniversityids0_identifier_stats(start=report_ms3)
- # for some reason, this causes a "msg parameter missing" error
- report_bucketurl = cuahsihisieeratwilkesuniversityids0_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="cuahsihisieeratwilkesuniversityids0")
- load_release = cuahsihisieeratwilkesuniversityids0_naburelease(start=harvest)
- load_uploadrelease = cuahsihisieeratwilkesuniversityids0_uploadrelease(start=load_release)
-
- load_prune = cuahsihisieeratwilkesuniversityids0_nabu_prune(start=load_uploadrelease)
- load_prov = cuahsihisieeratwilkesuniversityids0_nabuprov(start=load_prune)
- load_org = cuahsihisieeratwilkesuniversityids0_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=cuahsihisieeratwilkesuniversityids0_missingreport_graph(start=load_org)
- report_graph=cuahsihisieeratwilkesuniversityids0_graph_reports(start=report_msgraph)
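- # note: a graph is not runnable on its own; the generated jobs/schedules
- # modules are assumed to wrap it, e.g. harvest_cuahsihisieeratwilkesuniversityids0.to_job()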
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisirwaids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisirwaids0.py
deleted file mode 100644
index 0fb7e4b2..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisirwaids0.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils.util
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, and not the names of the object in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
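One caution on the configuration block above: `distutils` is deprecated and removed in Python 3.12, and `strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))` raises when the variable is unset, because `strtobool` receives `None`. A dependency-free sketch of the same parse with an explicit default (`_env_bool` is an illustrative name, not part of this module):

```python
import os

def _env_bool(name: str, default: bool = False) -> bool:
    """Parse a boolean env var without distutils; unset falls back to default."""
    val = os.environ.get(name)
    if val is None:
        return default
    return val.strip().lower() in ("1", "true", "t", "yes", "y", "on")

# e.g. GLEANER_MINIO_USE_SSL = _env_bool("GLEANERIO_MINIO_USE_SSL")
```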
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
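Note that `s3reader` logs an `S3Error` and then implicitly returns `None`, so a missing object only surfaces later as an `AttributeError` in the caller. A sketch of a stricter variant (hypothetical helper, same `Minio` client as above) that re-raises instead:

```python
from minio import Minio
from minio.error import S3Error

def s3read_strict(client: Minio, bucket: str, object_name: str) -> bytes:
    """Fetch an object and fail loudly; callers never see a silent None."""
    try:
        resp = client.get_object(bucket, object_name)
        try:
            return resp.read()
        finally:
            # get_object returns a streaming response; always release it
            resp.close()
            resp.release_conn()
    except S3Error as err:
        raise RuntimeError(f"S3 read failed for {bucket}/{object_name}: {err}") from err
```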
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
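The port handling above encodes a MinIO client convention worth making explicit: the endpoint is a bare `host[:port]` with no scheme, and the standard port (80 when `secure` is false, 443 when true) is omitted. A minimal sketch of that rule as a standalone helper (`minio_endpoint` is an illustrative name, not from this module):

```python
def minio_endpoint(address: str, port: str, secure: bool) -> str:
    """Build a MinIO endpoint: bare host[:port]; scheme is implied by `secure`."""
    if address.endswith(".amazonaws.com"):
        address = "s3.amazonaws.com"  # same AWS special case as _pythonMinioUrl
    default_port = "443" if secure else "80"
    if port and port != default_port:
        return f"{address}:{port}"
    return address  # standard port: omit it, as the Minio() client expects

# e.g. minio_endpoint("minio.example.org", "9000", False) -> "minio.example.org:9000"
```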
-def postRelease(source):
- # revision of EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
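`postRelease` above detects an empty load by substring-matching `data modified="0"` in Blazegraph's XML response. A slightly more robust sketch (assuming the same `modified="N"` attribute in the response body; `insert_release` is an illustrative name) parses the mutation count with a regex:

```python
import re
import requests

def insert_release(endpoint: str, release_url: str) -> int:
    """POST an insert-by-URL to Blazegraph and return the mutation count."""
    r = requests.post(f"{endpoint}?uri={release_url}")
    r.raise_for_status()
    m = re.search(r'modified="(\d+)"', r.text)
    modified = int(m.group(1)) if m else 0
    if modified == 0:
        raise Exception("No Data Added: " + r.text)
    return modified
```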
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
- ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
- # docker.py if replicated job, total completions = replicas
- # replicas =0 you do not get a container
- service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
- mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise Exception(f"Container for service {name} not starting")
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
-
-
-
-
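The loop in `_create_service` polls for up to ~12 seconds until the swarm task's container shows up in `client.containers.list`. A related sketch, built on the documented docker-py task dictionary (`Status.State`, `Status.ContainerStatus.ExitCode`), waits on the task state itself; the helper name and timeout are illustrative:

```python
import time

def wait_for_task_exit(service, timeout: int = 600) -> int:
    """Poll a replicated-job service until its single task finishes.

    Sketch only: assumes one task, as with replicas=1 above.
    """
    deadline = time.time() + timeout
    while time.time() < deadline:
        tasks = service.tasks()
        if tasks:
            status = tasks[0].get("Status", {})
            if status.get("State") in ("complete", "failed", "rejected"):
                return status.get("ContainerStatus", {}).get("ExitCode", 1)
        time.sleep(1)
    raise TimeoutError(f"service {service.name} did not finish in {timeout}s")
```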
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
- cid = container.id # legacy until the service start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
- ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
- # Future: need to extract files from the tar archive, and upload each one
- # pw_tar = tarfile.open(fileobj=io.BytesIO(d), mode="r|*")  # tar bytes are binary; BytesIO, not StringIO
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues: it returns nothing
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
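The `if/elif` chain at the top of `gleanerio` maps each mode to an image, a working directory, and an argument list; every branch differs only in those values. An equivalent table-driven sketch (`MODES` and `resolve_mode` are illustrative names; the arguments mirror the branches above):

```python
# Illustrative refactor of the mode dispatch above; not part of the generated file.
MODES = {
    "gleaner": (GLEANERIO_GLEANER_IMAGE, "/gleaner/",
                lambda src: ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH, "-source", src, "--rude"]),
    "prune":   (GLEANERIO_NABU_IMAGE, "/nabu/",
                lambda src: ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + src]),
    "prov":    (GLEANERIO_NABU_IMAGE, "/nabu/",
                lambda src: ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + src]),
    "orgs":    (GLEANERIO_NABU_IMAGE, "/nabu/",
                lambda src: ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]),
    "release": (GLEANERIO_NABU_IMAGE, "/nabu/",
                lambda src: ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + src]),
}

def resolve_mode(mode: str, source: str):
    image, workdir, argf = MODES[mode]  # KeyError replaces the silent returnCode=1
    return image, workdir, argf(source), f"sch_{source}_{mode}"
```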
-@op
-def cuahsihisirwaids0_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def cuahsihisirwaids0_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "cuahsihisirwaids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisirwaids0_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "cuahsihisirwaids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisirwaids0_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "cuahsihisirwaids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisirwaids0_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "cuahsihisirwaids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisirwaids0_naburelease(context):
- returned_value = gleanerio(context,("release"), "cuahsihisirwaids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisirwaids0_uploadrelease(context):
- returned_value = postRelease("cuahsihisirwaids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisirwaids0_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisirwaids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisirwaids0"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisirwaids0_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisirwaids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisirwaids0"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisirwaids0_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisirwaids0")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisirwaids0"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisirwaids0_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisirwaids0")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisirwaids0"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifier stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisirwaids0_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisirwaids0"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucket urls report returned {r} ")
- return
-
-
-#Can we simplify and use just a method. Then import these methods?
-# def missingreport_s3(context, msg: str, source="cuahsihisirwaids0"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="cuahsihisirwaids0"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_cuahsihisirwaids0():
- containers = cuahsihisirwaids0_getImage()
- harvest = cuahsihisirwaids0_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = cuahsihisirwaids0_missingreport_s3(start=harvest)
- report_idstat = cuahsihisirwaids0_identifier_stats(start=report_ms3)
- # for some reason, this causes a "msg parameter missing" error
- report_bucketurl = cuahsihisirwaids0_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="cuahsihisirwaids0")
- load_release = cuahsihisirwaids0_naburelease(start=harvest)
- load_uploadrelease = cuahsihisirwaids0_uploadrelease(start=load_release)
-
- load_prune = cuahsihisirwaids0_nabu_prune(start=load_uploadrelease)
- load_prov = cuahsihisirwaids0_nabuprov(start=load_prune)
- load_org = cuahsihisirwaids0_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=cuahsihisirwaids0_missingreport_graph(start=load_org)
- report_graph=cuahsihisirwaids0_graph_reports(start=report_msgraph)
-
-
-
-
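The `harvest_*` graphs order side-effecting ops purely through `Nothing` dependencies: each downstream op declares `ins={"start": In(Nothing)}` and the graph passes `start=` from the upstream call, so only ordering flows between ops, never data. A minimal standalone sketch of the same pattern:

```python
from dagster import In, Nothing, graph, op

@op
def first_step():
    pass  # side effect only; nothing meaningful is returned

@op(ins={"start": In(Nothing)})
def second_step():
    pass  # runs strictly after first_step, receives no value

@graph
def ordered():
    second_step(start=first_step())

ordered_job = ordered.to_job()
```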
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisisbenaids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisisbenaids0.py
deleted file mode 100644
index e58f4e18..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisisbenaids0.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, and not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET = str(os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE = str(os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
- # revision of EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
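`_get_client` talks to the Docker Engine API through Portainer's proxy, so every request must carry an `X-API-Key` header; the code injects it into the docker-py session in two places to be safe. A condensed sketch of that setup with a fail-fast ping (`portainer_docker_client` is an illustrative name):

```python
import docker

def portainer_docker_client(url: str, api_key: str) -> docker.DockerClient:
    """Sketch: docker-py client routed through a Portainer Docker API proxy.

    Assumes `url` is the Portainer endpoint proxy (as PORTAINER_URL above) and
    that every request must carry an X-API-Key header.
    """
    client = docker.DockerClient(base_url=url, version="1.43")
    client.api.headers["X-API-Key"] = api_key  # same injection as _get_client
    client.ping()  # fail fast if the key or URL is wrong
    return client
```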
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
- ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
- # docker.py if replicated job, total completions = replicas
- # replicas =0 you do not get a container
- service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
- mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise Exception(f"Container for service {name} not starting")
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
- cid = container.id # legacy until the service start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
- ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
- # Future: need to extract files from the tar archive, and upload each one
- # pw_tar = tarfile.open(fileobj=io.BytesIO(d), mode="r|*")  # tar bytes are binary; BytesIO, not StringIO
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues: it returns nothing
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def cuahsihisisbenaids0_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def cuahsihisisbenaids0_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "cuahsihisisbenaids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisisbenaids0_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "cuahsihisisbenaids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisisbenaids0_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "cuahsihisisbenaids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisisbenaids0_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "cuahsihisisbenaids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisisbenaids0_naburelease(context):
- returned_value = gleanerio(context,("release"), "cuahsihisisbenaids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisisbenaids0_uploadrelease(context):
- returned_value = postRelease("cuahsihisisbenaids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisisbenaids0_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisisbenaids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisisbenaids0"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisisbenaids0_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisisbenaids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisisbenaids0"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisisbenaids0_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisisbenaids0")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisisbenaids0"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisisbenaids0_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisisbenaids0")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisisbenaids0"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifier stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisisbenaids0_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisisbenaids0"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucket urls report returned {r} ")
- return
-
-
-#Can we simplify and use just a method. Then import these methods?
-# def missingreport_s3(context, msg: str, source="cuahsihisisbenaids0"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="cuahsihisisbenaids0"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_cuahsihisisbenaids0():
- containers = cuahsihisisbenaids0_getImage()
- harvest = cuahsihisisbenaids0_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = cuahsihisisbenaids0_missingreport_s3(start=harvest)
- report_idstat = cuahsihisisbenaids0_identifier_stats(start=report_ms3)
- # for some reason, this causes a "msg parameter missing" error
- report_bucketurl = cuahsihisisbenaids0_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="cuahsihisisbenaids0")
- load_release = cuahsihisisbenaids0_naburelease(start=harvest)
- load_uploadrelease = cuahsihisisbenaids0_uploadrelease(start=load_release)
-
- load_prune = cuahsihisisbenaids0_nabu_prune(start=load_uploadrelease)
- load_prov = cuahsihisisbenaids0_nabuprov(start=load_prune)
- load_org = cuahsihisisbenaids0_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=cuahsihisisbenaids0_missingreport_graph(start=load_org)
- report_graph=cuahsihisisbenaids0_graph_reports(start=report_msgraph)
-
-
-
-
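Each generated module deleted in this diff repeats the same ~700 lines with only the source name changed. Dagster op and graph factories would let one parameterized builder replace them; a sketch under that assumption (`make_harvest_graph` is an illustrative name, and `gleanerio` stands in for the shared helper defined in each module):

```python
from dagster import In, Nothing, graph, op

def make_harvest_graph(source: str):
    """Factory sketch: one parameterized graph instead of one module per source."""

    @op(name=f"{source}_gleaner")
    def gleaner_op(context):
        gleanerio(context, "gleaner", source)  # shared helper from this module

    @op(name=f"{source}_nabu_prune", ins={"start": In(Nothing)})
    def prune_op(context):
        gleanerio(context, "prune", source)

    @graph(name=f"harvest_{source}")
    def harvest():
        prune_op(start=gleaner_op())

    return harvest

# e.g. harvest = make_harvest_graph("cuahsihisirwaids0"); harvest_job = harvest.to_job()
```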
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihiskansasweatherdataids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihiskansasweatherdataids0.py
deleted file mode 100644
index 5974a395..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihiskansasweatherdataids0.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, and not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
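-    # urlopen() raises ValueError for strings that are not URLs, so a plain
-    # filesystem path falls through to the local open() below.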
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
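-    # The Minio client expects an endpoint of the form host[:port] with no
-    # scheme, so the port is omitted when it is the scheme default
-    # (80 for http, 443 for https).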
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
-    # revision of EC utilities, will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
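-    # URL points at a Portainer-proxied Docker API, so the X-API-Key header
-    # must ride along on every request; it is set both in the client defaults
-    # and on the underlying APIClient session.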
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
-    ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
-    # docker-py: for a replicated job, total completions = replicas;
-    # with replicas=0 you do not get a container at all
-    service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
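-    # Together, restart_policy=none and a one-replica replicated-job make the
-    # swarm service behave like a one-shot container run: it executes once to
-    # completion and is never rescheduled.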
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
-        mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise f"Container for service {name} not starting"
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
-        cid = container.id # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
-        ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
-        # Future: need to extract files and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def cuahsihiskansasweatherdataids0_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def cuahsihiskansasweatherdataids0_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "cuahsihiskansasweatherdataids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihiskansasweatherdataids0_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "cuahsihiskansasweatherdataids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihiskansasweatherdataids0_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "cuahsihiskansasweatherdataids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihiskansasweatherdataids0_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "cuahsihiskansasweatherdataids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihiskansasweatherdataids0_naburelease(context):
- returned_value = gleanerio(context,("release"), "cuahsihiskansasweatherdataids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihiskansasweatherdataids0_uploadrelease(context):
- returned_value = postRelease("cuahsihiskansasweatherdataids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def cuahsihiskansasweatherdataids0_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihiskansasweatherdataids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihiskansasweatherdataids0"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-    r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihiskansasweatherdataids0_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihiskansasweatherdataids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihiskansasweatherdataids0"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihiskansasweatherdataids0_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihiskansasweatherdataids0")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihiskansasweatherdataids0"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihiskansasweatherdataids0_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihiskansasweatherdataids0")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihiskansasweatherdataids0"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihiskansasweatherdataids0_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihiskansasweatherdataids0"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
-#Can we simplify and use just a method. Then import these methods?
-# def missingreport_s3(context, msg: str, source="cuahsihiskansasweatherdataids0"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="cuahsihiskansasweatherdataids0"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_cuahsihiskansasweatherdataids0():
- containers = cuahsihiskansasweatherdataids0_getImage()
- harvest = cuahsihiskansasweatherdataids0_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = cuahsihiskansasweatherdataids0_missingreport_s3(start=harvest)
- report_idstat = cuahsihiskansasweatherdataids0_identifier_stats(start=report_ms3)
- # for some reason, this causes a msg parameter missing
- report_bucketurl = cuahsihiskansasweatherdataids0_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="cuahsihiskansasweatherdataids0")
- load_release = cuahsihiskansasweatherdataids0_naburelease(start=harvest)
- load_uploadrelease = cuahsihiskansasweatherdataids0_uploadrelease(start=load_release)
-
- load_prune = cuahsihiskansasweatherdataids0_nabu_prune(start=load_uploadrelease)
- load_prov = cuahsihiskansasweatherdataids0_nabuprov(start=load_prune)
- load_org = cuahsihiskansasweatherdataids0_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=cuahsihiskansasweatherdataids0_missingreport_graph(start=load_org)
- report_graph=cuahsihiskansasweatherdataids0_graph_reports(start=report_msgraph)
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihislaselvastreamdischargeids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihislaselvastreamdischargeids0.py
deleted file mode 100644
index a54be7f1..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihislaselvastreamdischargeids0.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils.util
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, and not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
-    # revision of EC utilities, will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
-    ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
-    # docker-py: for a replicated job, total completions = replicas;
-    # with replicas=0 you do not get a container at all
-    service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
-        mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise f"Container for service {name} not starting"
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
-        cid = container.id # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
-        ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
-        # Future: need to extract files and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def cuahsihislaselvastreamdischargeids0_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def cuahsihislaselvastreamdischargeids0_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "cuahsihislaselvastreamdischargeids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihislaselvastreamdischargeids0_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "cuahsihislaselvastreamdischargeids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihislaselvastreamdischargeids0_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "cuahsihislaselvastreamdischargeids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihislaselvastreamdischargeids0_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "cuahsihislaselvastreamdischargeids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihislaselvastreamdischargeids0_naburelease(context):
- returned_value = gleanerio(context,("release"), "cuahsihislaselvastreamdischargeids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihislaselvastreamdischargeids0_uploadrelease(context):
- returned_value = postRelease("cuahsihislaselvastreamdischargeids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def cuahsihislaselvastreamdischargeids0_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihislaselvastreamdischargeids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihislaselvastreamdischargeids0"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-    r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihislaselvastreamdischargeids0_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihislaselvastreamdischargeids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihislaselvastreamdischargeids0"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihislaselvastreamdischargeids0_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihislaselvastreamdischargeids0")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihislaselvastreamdischargeids0"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihislaselvastreamdischargeids0_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihislaselvastreamdischargeids0")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihislaselvastreamdischargeids0"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihislaselvastreamdischargeids0_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihislaselvastreamdischargeids0"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
-#Can we simplify and use just a method. Then import these methods?
-# def missingreport_s3(context, msg: str, source="cuahsihislaselvastreamdischargeids0"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="cuahsihislaselvastreamdischargeids0"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_cuahsihislaselvastreamdischargeids0():
- containers = cuahsihislaselvastreamdischargeids0_getImage()
- harvest = cuahsihislaselvastreamdischargeids0_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = cuahsihislaselvastreamdischargeids0_missingreport_s3(start=harvest)
- report_idstat = cuahsihislaselvastreamdischargeids0_identifier_stats(start=report_ms3)
- # for some reason, this causes a msg parameter missing
- report_bucketurl = cuahsihislaselvastreamdischargeids0_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="cuahsihislaselvastreamdischargeids0")
- load_release = cuahsihislaselvastreamdischargeids0_naburelease(start=harvest)
- load_uploadrelease = cuahsihislaselvastreamdischargeids0_uploadrelease(start=load_release)
-
- load_prune = cuahsihislaselvastreamdischargeids0_nabu_prune(start=load_uploadrelease)
- load_prov = cuahsihislaselvastreamdischargeids0_nabuprov(start=load_prune)
- load_org = cuahsihislaselvastreamdischargeids0_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=cuahsihislaselvastreamdischargeids0_missingreport_graph(start=load_org)
- report_graph=cuahsihislaselvastreamdischargeids0_graph_reports(start=report_msgraph)
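-# The ops above are chained purely through Nothing dependencies (the
-# start=In(Nothing) inputs), so ordering is enforced without passing data.
-# A minimal sketch of how a graph like this is typically made runnable; the
-# job name below is illustrative, not taken from this file:
-#
-#   harvest_laselva_job = harvest_cuahsihislaselvastreamdischargeids0.to_job()
-#   # the job can then be attached to a Dagster schedule or sensor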
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihislczoodm2ids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihislczoodm2ids0.py
deleted file mode 100644
index c744b8b5..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihislczoodm2ids0.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils.util
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, and not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
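-# Note: strtobool accepts y/yes/t/true/on/1 (and their negatives) and raises
-# ValueError on anything else; distutils is deprecated in newer Pythons, so a
-# small local parser may eventually be needed here.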
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
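-# Examples (hypothetical values):
-#   _pythonMinioUrl("minio.example.org")          -> "minio.example.org"
-#   _pythonMinioUrl("mybucket.s3.amazonaws.com")  -> "s3.amazonaws.com"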
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
-    # Make 'X' bucket if it does not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
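-# A hedged usage sketch: s3loader takes bytes plus a run name and writes a
-# timestamped object under GLEANERIO_LOG_PREFIX, e.g.
-#   s3loader(b"container log text", "sch_example_gleaner")
-# lands at roughly scheduler/logs/sch_example_gleaner_<YYYY_MM_DD_HH_MM_SS>.log
-# ("sch_example_gleaner" is illustrative; real names come from the ops below).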
-def postRelease(source):
-    # revision of EC utilities, will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
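-# In effect, postRelease issues (values depend on the environment):
-#   POST {graph endpoint}?uri={proto}://{address}:{port}/{bucket}/graphs/latest/{source}_release.nq
-# and treats HTTP 200 with a non-zero "data modified" count as success.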
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
-    ## thoughts
-    # return service, container, since there is one
-    restart_policy = RestartPolicy(condition='none')
-    # docker.py: for a replicated job, total completions = replicas
-    # with replicas=0 you do not get a container
-    service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
-        mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
-            raise Exception(f"Container for service {name} not starting")
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
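-# Note on the loop above: a "replicated-job" swarm service does not expose its
-# task container immediately, so we poll the engine (roughly 12 seconds) for a
-# container labelled com.docker.swarm.service.name=<name> before returning it.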
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
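-    # Summary of the dispatch above (config paths come from the env vars):
-    #   gleaner : gleaner image, --cfg <gleaner cfg> -source <source> --rude
-    #   prune   : nabu image,    --cfg <nabu cfg> prune   --prefix summoned/<source>
-    #   prov    : nabu image,    --cfg <nabu cfg> prefix  --prefix prov/<source>
-    #   orgs    : nabu image,    --cfg <nabu cfg> prefix  --prefix orgs
-    #   release : nabu image,    --cfg <nabu cfg> release --prefix summoned/<source>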
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
-        cid = container.id # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
-        ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
-        # Future: need to extract files and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
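-# The ops below drive gleanerio, e.g. gleanerio(context, "gleaner", "<source>")
-# for a harvest and gleanerio(context, "release", "<source>") for a nabu release.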
-
-@op
-def cuahsihislczoodm2ids0_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def cuahsihislczoodm2ids0_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "cuahsihislczoodm2ids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihislczoodm2ids0_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "cuahsihislczoodm2ids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihislczoodm2ids0_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "cuahsihislczoodm2ids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihislczoodm2ids0_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "cuahsihislczoodm2ids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihislczoodm2ids0_naburelease(context):
- returned_value = gleanerio(context,("release"), "cuahsihislczoodm2ids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihislczoodm2ids0_uploadrelease(context):
- returned_value = postRelease("cuahsihislczoodm2ids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def cuahsihislczoodm2ids0_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihislczoodm2ids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihislczoodm2ids0"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-    r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihislczoodm2ids0_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihislczoodm2ids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihislczoodm2ids0"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihislczoodm2ids0_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihislczoodm2ids0")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihislczoodm2ids0"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihislczoodm2ids0_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihislczoodm2ids0")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihislczoodm2ids0"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
-    get_dagster_logger().info(f"identifier stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihislczoodm2ids0_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihislczoodm2ids0"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
-    get_dagster_logger().info(f"bucket urls report returned {r} ")
- return
-
-
-#Can we simplify and use just a method. Then import these methods?
-# def missingreport_s3(context, msg: str, source="cuahsihislczoodm2ids0"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="cuahsihislczoodm2ids0"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_cuahsihislczoodm2ids0():
- containers = cuahsihislczoodm2ids0_getImage()
- harvest = cuahsihislczoodm2ids0_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = cuahsihislczoodm2ids0_missingreport_s3(start=harvest)
- report_idstat = cuahsihislczoodm2ids0_identifier_stats(start=report_ms3)
-    # for some reason, this causes a "msg parameter missing" error
- report_bucketurl = cuahsihislczoodm2ids0_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="cuahsihislczoodm2ids0")
- load_release = cuahsihislczoodm2ids0_naburelease(start=harvest)
- load_uploadrelease = cuahsihislczoodm2ids0_uploadrelease(start=load_release)
-
- load_prune = cuahsihislczoodm2ids0_nabu_prune(start=load_uploadrelease)
- load_prov = cuahsihislczoodm2ids0_nabuprov(start=load_prune)
- load_org = cuahsihislczoodm2ids0_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=cuahsihislczoodm2ids0_missingreport_graph(start=load_org)
- report_graph=cuahsihislczoodm2ids0_graph_reports(start=report_msgraph)
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihislittlebearriverids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihislittlebearriverids0.py
deleted file mode 100644
index 68aead74..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihislittlebearriverids0.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils.util
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, and not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
-    # Make 'X' bucket if it does not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
-    # revision of EC utilities, will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
-    ## thoughts
-    # return service, container, since there is one
-    restart_policy = RestartPolicy(condition='none')
-    # docker.py: for a replicated job, total completions = replicas
-    # with replicas=0 you do not get a container
-    service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
-        mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
-            raise Exception(f"Container for service {name} not starting")
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
-        cid = container.id # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
-        ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
-        # Future: need to extract files and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def cuahsihislittlebearriverids0_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def cuahsihislittlebearriverids0_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "cuahsihislittlebearriverids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihislittlebearriverids0_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "cuahsihislittlebearriverids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihislittlebearriverids0_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "cuahsihislittlebearriverids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihislittlebearriverids0_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "cuahsihislittlebearriverids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihislittlebearriverids0_naburelease(context):
- returned_value = gleanerio(context,("release"), "cuahsihislittlebearriverids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihislittlebearriverids0_uploadrelease(context):
- returned_value = postRelease("cuahsihislittlebearriverids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def cuahsihislittlebearriverids0_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihislittlebearriverids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihislittlebearriverids0"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-    r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihislittlebearriverids0_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihislittlebearriverids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihislittlebearriverids0"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihislittlebearriverids0_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihislittlebearriverids0")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihislittlebearriverids0"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihislittlebearriverids0_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihislittlebearriverids0")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihislittlebearriverids0"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
-    get_dagster_logger().info(f"identifier stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihislittlebearriverids0_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihislittlebearriverids0"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
-#Can we simplify and use just a method. Then import these methods?
-# def missingreport_s3(context, msg: str, source="cuahsihislittlebearriverids0"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="cuahsihislittlebearriverids0"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_cuahsihislittlebearriverids0():
- containers = cuahsihislittlebearriverids0_getImage()
- harvest = cuahsihislittlebearriverids0_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = cuahsihislittlebearriverids0_missingreport_s3(start=harvest)
- report_idstat = cuahsihislittlebearriverids0_identifier_stats(start=report_ms3)
- # for some reason, this causes a missing msg parameter error
- report_bucketurl = cuahsihislittlebearriverids0_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="cuahsihislittlebearriverids0")
- load_release = cuahsihislittlebearriverids0_naburelease(start=harvest)
- load_uploadrelease = cuahsihislittlebearriverids0_uploadrelease(start=load_release)
-
- load_prune = cuahsihislittlebearriverids0_nabu_prune(start=load_uploadrelease)
- load_prov = cuahsihislittlebearriverids0_nabuprov(start=load_prune)
- load_org = cuahsihislittlebearriverids0_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=cuahsihislittlebearriverids0_missingreport_graph(start=load_org)
- report_graph=cuahsihislittlebearriverids0_graph_reports(start=report_msgraph)
-
-
-
-
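-# A minimal sketch (illustrative, not part of the generated file) of the
-# Nothing-dependency pattern used in the graph above: downstream ops declare
-# ins={"start": In(Nothing)} and are ordered with start=...; no data passes.
-#
-# from dagster import In, Nothing, job, op
-#
-# @op
-# def first_step():
-#     pass
-#
-# @op(ins={"start": In(Nothing)})
-# def second_step():
-#     pass
-#
-# @job
-# def ordered():
-#     second_step(start=first_step())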
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisloganrivergamutids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisloganrivergamutids0.py
deleted file mode 100644
index 75145e02..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisloganrivergamutids0.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, and not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
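-# Usage sketch (hypothetical paths): load_data tries the argument as a URL
-# first and falls back to a local file when urlopen raises ValueError.
-#   cfg = load_data("https://example.org/gleanerconfig.yaml")  # fetched over HTTP
-#   cfg = load_data("/gleaner/gleanerconfig.yaml")             # read from disk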
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
- # revision of EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
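-# For illustration (hypothetical hosts), the POST assembled above looks like:
-#   http://graph.example.org/namespace/demo/sparql?uri=http://minio.example.org:80/gleaner/graphs/latest/demo_release.nq
-# i.e. the graph store is asked to fetch the release nquads from s3 itself.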
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
- ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
- # docker.py: for a replicated job, total completions = replicas
- # with replicas=0 you do not get a container
- service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
- mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise f"Container for service {name} not starting"
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
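-# Caller sketch (hypothetical arguments), mirroring gleanerio() below: create
-# the one-shot service, wait on its container, then clean up.
-#   service, container = _create_service(context, client, container_context,
-#       "nsfearthcube/gleaner:latest", None, ["--cfg", "/gleaner/gleanerconfig.yaml"],
-#       name="sch_demo_gleaner", workingdir="/gleaner/")
-#   exit_status = container.wait()["StatusCode"]
-#   service.remove()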
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
- cid = container.id # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
- ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
- # Future: need to extract files, and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
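-# Call sketch (hypothetical source name): one gleanerio() call per stage, as
-# the ops below do.
-#   gleanerio(context, "gleaner", "demo_source")  # harvest into summoned/demo_source
-#   gleanerio(context, "release", "demo_source")  # build the graphs/latest release
-#   gleanerio(context, "prune", "demo_source")    # drop stale summoned objects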
-
-@op
-def cuahsihisloganrivergamutids0_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def cuahsihisloganrivergamutids0_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "cuahsihisloganrivergamutids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisloganrivergamutids0_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "cuahsihisloganrivergamutids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisloganrivergamutids0_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "cuahsihisloganrivergamutids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisloganrivergamutids0_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "cuahsihisloganrivergamutids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisloganrivergamutids0_naburelease(context):
- returned_value = gleanerio(context,("release"), "cuahsihisloganrivergamutids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisloganrivergamutids0_uploadrelease(context):
- returned_value = postRelease("cuahsihisloganrivergamutids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisloganrivergamutids0_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisloganrivergamutids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisloganrivergamutids0"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisloganrivergamutids0_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisloganrivergamutids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisloganrivergamutids0"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summoned check off; graph-side report only
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisloganrivergamutids0_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisloganrivergamutids0")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisloganrivergamutids0"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisloganrivergamutids0_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisloganrivergamutids0")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisloganrivergamutids0"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisloganrivergamutids0_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisloganrivergamutids0"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
-#Can we simplify and use just a method. Then import these methods?
-# def missingreport_s3(context, msg: str, source="cuahsihisloganrivergamutids0"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="cuahsihisloganrivergamutids0"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_cuahsihisloganrivergamutids0():
- containers = cuahsihisloganrivergamutids0_getImage()
- harvest = cuahsihisloganrivergamutids0_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = cuahsihisloganrivergamutids0_missingreport_s3(start=harvest)
- report_idstat = cuahsihisloganrivergamutids0_identifier_stats(start=report_ms3)
- # for some reason, this causes a missing msg parameter error
- report_bucketurl = cuahsihisloganrivergamutids0_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="cuahsihisloganrivergamutids0")
- load_release = cuahsihisloganrivergamutids0_naburelease(start=harvest)
- load_uploadrelease = cuahsihisloganrivergamutids0_uploadrelease(start=load_release)
-
- load_prune = cuahsihisloganrivergamutids0_nabu_prune(start=load_uploadrelease)
- load_prov = cuahsihisloganrivergamutids0_nabuprov(start=load_prune)
- load_org = cuahsihisloganrivergamutids0_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=cuahsihisloganrivergamutids0_missingreport_graph(start=load_org)
- report_graph=cuahsihisloganrivergamutids0_graph_reports(start=report_msgraph)
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisloganriverids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisloganriverids0.py
deleted file mode 100644
index cb43eeef..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisloganriverids0.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, and not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
- # revision of EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
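-# Sketch (assumed endpoint) of what _get_client talks to: the Docker Engine
-# API proxied through Portainer, authenticated with the X-API-Key header that
-# is injected into the SDK's general config above.
-#   URL    = "https://portainer.example.org/api/endpoints/1/docker"
-#   APIKEY = "ptr_..."  # Portainer access token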
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
- ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
- # docker.py: for a replicated job, total completions = replicas
- # with replicas=0 you do not get a container
- service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
- mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise f"Container for service {name} not starting"
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
- cid = container.id # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
- ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
- # Future: need to extract files, and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def cuahsihisloganriverids0_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def cuahsihisloganriverids0_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "cuahsihisloganriverids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisloganriverids0_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "cuahsihisloganriverids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisloganriverids0_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "cuahsihisloganriverids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisloganriverids0_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "cuahsihisloganriverids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisloganriverids0_naburelease(context):
- returned_value = gleanerio(context,("release"), "cuahsihisloganriverids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisloganriverids0_uploadrelease(context):
- returned_value = postRelease("cuahsihisloganriverids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisloganriverids0_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisloganriverids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisloganriverids0"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisloganriverids0_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisloganriverids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisloganriverids0"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summoned check off; graph-side report only
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisloganriverids0_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisloganriverids0")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisloganriverids0"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisloganriverids0_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisloganriverids0")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisloganriverids0"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisloganriverids0_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisloganriverids0"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
-# Can we simplify this to plain methods, and then import those methods?
-# def missingreport_s3(context, msg: str, source="cuahsihisloganriverids0"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="cuahsihisloganriverids0"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_cuahsihisloganriverids0():
- containers = cuahsihisloganriverids0_getImage()
- harvest = cuahsihisloganriverids0_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = cuahsihisloganriverids0_missingreport_s3(start=harvest)
- report_idstat = cuahsihisloganriverids0_identifier_stats(start=report_ms3)
-    # for some reason, this caused a "missing msg parameter" error
- report_bucketurl = cuahsihisloganriverids0_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="cuahsihisloganriverids0")
- load_release = cuahsihisloganriverids0_naburelease(start=harvest)
- load_uploadrelease = cuahsihisloganriverids0_uploadrelease(start=load_release)
-
- load_prune = cuahsihisloganriverids0_nabu_prune(start=load_uploadrelease)
- load_prov = cuahsihisloganriverids0_nabuprov(start=load_prune)
- load_org = cuahsihisloganriverids0_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=cuahsihisloganriverids0_missingreport_graph(start=load_org)
- report_graph=cuahsihisloganriverids0_graph_reports(start=report_msgraph)
-
-
-
-
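The generated graph above chains its ops purely for ordering, using Dagster's Nothing dependencies (see the docs link embedded in the code). A minimal sketch of that pattern, with hypothetical op names standing in for the generated *_getImage, *_gleaner, and report ops:

```python
from dagster import In, Nothing, get_dagster_logger, graph, op

@op
def pull_images():
    # stand-in for the generated *_getImage op
    get_dagster_logger().info("images pulled")

@op(ins={"start": In(Nothing)})
def harvest():
    # stand-in for the generated *_gleaner op
    get_dagster_logger().info("harvest ran after pull_images")

@op(ins={"start": In(Nothing)})
def report():
    get_dagster_logger().info("report ran after harvest")

@graph
def harvest_example():
    # wiring an op's output into a Nothing input orders ops without passing data
    report(start=harvest(start=pull_images()))

harvest_example_job = harvest_example.to_job()
```

Passing an upstream op's result to a `start: In(Nothing)` input enforces execution order without moving any data, which is why every generated op simply returns None.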
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihislterntlwoodruffids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihislterntlwoodruffids0.py
deleted file mode 100644
index 2c4810a4..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihislterntlwoodruffids0.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils.util
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, and not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
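-    # drop the port when it is the scheme default (80 for plain HTTP, 443 for
-    # TLS); otherwise the endpoint must carry it explicitly as host:port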
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
-    # a future revision of the EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
-        # a response body containing data modified="0" means nothing was inserted
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
-    ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
-    # docker.py: for a replicated job, total completions = replicas
-    # with replicas=0 you do not get a container
-    service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
-        mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise f"Container for service {name} not starting"
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
-        cid = container.id # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
-        ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
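-        # Docker Engine API: GET /containers/{id}/archive?path=... returns a
-        # tar stream of that path; it is fetched through Portainer (hence the
-        # X-API-Key header) and uploaded to S3 as-is, still tarred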
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
-        # Future: need to extract the files and upload them
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def cuahsihislterntlwoodruffids0_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def cuahsihislterntlwoodruffids0_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "cuahsihislterntlwoodruffids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihislterntlwoodruffids0_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "cuahsihislterntlwoodruffids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihislterntlwoodruffids0_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "cuahsihislterntlwoodruffids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihislterntlwoodruffids0_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "cuahsihislterntlwoodruffids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihislterntlwoodruffids0_naburelease(context):
- returned_value = gleanerio(context,("release"), "cuahsihislterntlwoodruffids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihislterntlwoodruffids0_uploadrelease(context):
- returned_value = postRelease("cuahsihislterntlwoodruffids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def cuahsihislterntlwoodruffids0_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihislterntlwoodruffids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihislterntlwoodruffids0"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-    r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihislterntlwoodruffids0_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihislterntlwoodruffids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihislterntlwoodruffids0"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihislterntlwoodruffids0_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihislterntlwoodruffids0")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihislterntlwoodruffids0"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihislterntlwoodruffids0_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihislterntlwoodruffids0")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihislterntlwoodruffids0"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihislterntlwoodruffids0_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihislterntlwoodruffids0"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
-# Can we simplify this to plain methods, and then import those methods?
-# def missingreport_s3(context, msg: str, source="cuahsihislterntlwoodruffids0"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="cuahsihislterntlwoodruffids0"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_cuahsihislterntlwoodruffids0():
- containers = cuahsihislterntlwoodruffids0_getImage()
- harvest = cuahsihislterntlwoodruffids0_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = cuahsihislterntlwoodruffids0_missingreport_s3(start=harvest)
- report_idstat = cuahsihislterntlwoodruffids0_identifier_stats(start=report_ms3)
-    # for some reason, this caused a "missing msg parameter" error
- report_bucketurl = cuahsihislterntlwoodruffids0_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="cuahsihislterntlwoodruffids0")
- load_release = cuahsihislterntlwoodruffids0_naburelease(start=harvest)
- load_uploadrelease = cuahsihislterntlwoodruffids0_uploadrelease(start=load_release)
-
- load_prune = cuahsihislterntlwoodruffids0_nabu_prune(start=load_uploadrelease)
- load_prov = cuahsihislterntlwoodruffids0_nabuprov(start=load_prune)
- load_org = cuahsihislterntlwoodruffids0_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=cuahsihislterntlwoodruffids0_missingreport_graph(start=load_org)
- report_graph=cuahsihislterntlwoodruffids0_graph_reports(start=report_msgraph)
-
-
-
-
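The `_create_service` helper deleted above runs each Gleaner/Nabu task as a one-shot Docker Swarm service: a "replicated-job" with restarts disabled, followed by a bounded poll for the container the job spawns. A hedged sketch of that pattern against the plain Docker SDK (the image, service name, and local `from_env()` client are placeholder assumptions; the real code targets a Portainer endpoint with an API key and also mounts swarm configs):

```python
import time

import docker
from docker.types import RestartPolicy, ServiceMode

client = docker.from_env()  # placeholder; the deleted code builds a client for Portainer

service = client.services.create(
    "alpine:latest",                                 # placeholder image
    args=["echo", "hello"],
    name="sch_example_job",
    restart_policy=RestartPolicy(condition="none"),  # never restart the task
    mode=ServiceMode("replicated-job", replicas=1, concurrency=1),
)

# the job's container appears asynchronously, so poll with a bounded wait
for _ in range(12):
    containers = client.containers.list(
        all=True,
        filters={"label": "com.docker.swarm.service.name=sch_example_job"},
    )
    if containers:
        break
    time.sleep(1)
else:
    raise RuntimeError("container for service sch_example_job not starting")

print(containers[0].wait()["StatusCode"])  # one-shot task: wait for its exit code
service.remove()
```

A replicated job with replicas=1 runs the task exactly once, which is what makes the container.wait() exit-status handling in gleanerio meaningful.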
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisluwlids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisluwlids0.py
deleted file mode 100644
index 44cdaa25..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisluwlids0.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils.util
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, and not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
-    # a future revision of the EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
-        # a response body containing data modified="0" means nothing was inserted
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
-    ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
-    # docker.py: for a replicated job, total completions = replicas
-    # with replicas=0 you do not get a container
-    service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
-        mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise f"Container for service {name} not starting"
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
-        cid = container.id # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
-        ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
-        # Future: need to extract the files and upload them
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def cuahsihisluwlids0_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def cuahsihisluwlids0_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "cuahsihisluwlids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisluwlids0_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "cuahsihisluwlids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisluwlids0_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "cuahsihisluwlids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisluwlids0_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "cuahsihisluwlids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisluwlids0_naburelease(context):
- returned_value = gleanerio(context,("release"), "cuahsihisluwlids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisluwlids0_uploadrelease(context):
- returned_value = postRelease("cuahsihisluwlids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisluwlids0_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisluwlids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisluwlids0"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisluwlids0_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisluwlids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisluwlids0"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon off for the graph-side report
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisluwlids0_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisluwlids0")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisluwlids0"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # returned value is already a JSON string
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisluwlids0_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisluwlids0")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisluwlids0"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisluwlids0_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisluwlids0"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
-# Can we simplify this and just use a method, then import these methods? (see the sketch below)
-# def missingreport_s3(context, msg: str, source="cuahsihisluwlids0"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="cuahsihisluwlids0"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
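-# One possible answer, sketched as an assumption (not code from this repo):
-# build the near-identical per-source ops from a small factory. The name
-# make_gleaner_op is hypothetical; gleanerio, op, In, Nothing and
-# get_dagster_logger are the ones already imported/defined above.
-def make_gleaner_op(source: str, mode: str):
- @op(name=f"{source}_{mode}", ins={"start": In(Nothing)})
- def _gleaner_op(context):
- returned_value = gleanerio(context, mode, source)
- get_dagster_logger().info(f"{mode} for {source} returned {returned_value}")
- return _gleaner_op
-
-# e.g. prune_op = make_gleaner_op("cuahsihisluwlids0", "prune")
-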
-@graph
-def harvest_cuahsihisluwlids0():
- containers = cuahsihisluwlids0_getImage()
- harvest = cuahsihisluwlids0_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = cuahsihisluwlids0_missingreport_s3(start=harvest)
- report_idstat = cuahsihisluwlids0_identifier_stats(start=report_ms3)
- # for some reason, this caused a "msg parameter missing" error
- report_bucketurl = cuahsihisluwlids0_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="cuahsihisluwlids0")
- load_release = cuahsihisluwlids0_naburelease(start=harvest)
- load_uploadrelease = cuahsihisluwlids0_uploadrelease(start=load_release)
-
- load_prune = cuahsihisluwlids0_nabu_prune(start=load_uploadrelease)
- load_prov = cuahsihisluwlids0_nabuprov(start=load_prune)
- load_org = cuahsihisluwlids0_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=cuahsihisluwlids0_missingreport_graph(start=load_org)
- report_graph=cuahsihisluwlids0_graph_reports(start=report_msgraph)
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihismaaeriids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihismaaeriids0.py
deleted file mode 100644
index 72bd6b1e..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihismaaeriids0.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils.util
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, and not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
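-# NOTE (editorial sketch): distutils is deprecated (PEP 632) and removed in
-# Python 3.12, so strtobool above will eventually need a local replacement.
-# A minimal stand-in, assuming the same truthy/falsy strings distutils used:
-def _strtobool(value):
- v = str(value).strip().lower()
- if v in ("y", "yes", "t", "true", "on", "1"):
- return True
- if v in ("n", "no", "f", "false", "off", "0"):
- return False
- raise ValueError(f"invalid truth value {value!r}")
-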
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- with open(image_path, 'rb') as f:
- data = f.read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
- # a future revision of EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
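-# The substring check above keys off Blazegraph's XML response, which reports
-# a mutation count as data modified="N". A hedged helper (the regex is an
-# assumption generalized from the exact string tested above):
-def _modified_count(response_text):
- import re
- m = re.search(r'data modified="(\d+)"', response_text)
- return int(m.group(1)) if m else 0
-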
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
- ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
- # docker.py if replicated job, total completions = replicas
- # replicas =0 you do not get a container
- service_mode = ServiceMode("replicated-job",concurrency=1,replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
- mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise f"Container for service {name} not starting"
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
-
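-# An alternative wait, sketched as an assumption (swarm task states include
-# "running", "complete" and "failed"): poll the service's tasks instead of
-# listing containers by label. _wait_for_task is a hypothetical helper.
-def _wait_for_task(service, attempts=12):
- for _ in range(attempts):
- time.sleep(1)
- for task in service.tasks():
- if task["Status"]["State"] in ("running", "complete", "failed"):
- return task
- raise Exception(f"no task started for service {service.name}")
-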
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
- cid = container.id # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
- ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
- # Future: need to extract the files, and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues; it returns nothing.
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
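- # A sketch of that "extract and upload" future work, assuming the archive
- # endpoint returns a tar stream; tarfile needs BytesIO (bytes), not
- # StringIO, and archive_bytes stands in for the body read above.
- # import tarfile
- # from io import BytesIO
- # with tarfile.open(fileobj=BytesIO(archive_bytes)) as tar:
- #     for member in tar.getmembers():
- #         if member.isfile():
- #             fobj = tar.extractfile(member)
- #             if fobj:
- #                 s3loader(fobj.read(), f"{source}_{mode}_{member.name}")
-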
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def cuahsihismaaeriids0_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def cuahsihismaaeriids0_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "cuahsihismaaeriids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihismaaeriids0_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "cuahsihismaaeriids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihismaaeriids0_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "cuahsihismaaeriids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihismaaeriids0_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "cuahsihismaaeriids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihismaaeriids0_naburelease(context):
- returned_value = gleanerio(context,("release"), "cuahsihismaaeriids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihismaaeriids0_uploadrelease(context):
- returned_value = postRelease("cuahsihismaaeriids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def cuahsihismaaeriids0_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihismaaeriids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihismaaeriids0"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihismaaeriids0_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihismaaeriids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihismaaeriids0"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon off for the graph-side report
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihismaaeriids0_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihismaaeriids0")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihismaaeriids0"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # returned value is already a JSON string
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihismaaeriids0_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihismaaeriids0")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihismaaeriids0"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihismaaeriids0_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihismaaeriids0"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
-# Can we simplify this and just use a method, then import these methods?
-# def missingreport_s3(context, msg: str, source="cuahsihismaaeriids0"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="cuahsihismaaeriids0"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_cuahsihismaaeriids0():
- containers = cuahsihismaaeriids0_getImage()
- harvest = cuahsihismaaeriids0_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = cuahsihismaaeriids0_missingreport_s3(start=harvest)
- report_idstat = cuahsihismaaeriids0_identifier_stats(start=report_ms3)
- # for some reason, this caused a "msg parameter missing" error
- report_bucketurl = cuahsihismaaeriids0_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="cuahsihismaaeriids0")
- load_release = cuahsihismaaeriids0_naburelease(start=harvest)
- load_uploadrelease = cuahsihismaaeriids0_uploadrelease(start=load_release)
-
- load_prune = cuahsihismaaeriids0_nabu_prune(start=load_uploadrelease)
- load_prov = cuahsihismaaeriids0_nabuprov(start=load_prune)
- load_org = cuahsihismaaeriids0_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=cuahsihismaaeriids0_missingreport_graph(start=load_org)
- report_graph=cuahsihismaaeriids0_graph_reports(start=report_msgraph)
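-
-# Editorial sketch (an assumption, not generated code): a graph like the one
-# above is typically wired into a schedulable job. to_job and
-# ScheduleDefinition are standard dagster APIs; the cron string is only an
-# example.
-from dagster import ScheduleDefinition
-
-harvest_cuahsihismaaeriids0_job = harvest_cuahsihismaaeriids0.to_job()
-harvest_cuahsihismaaeriids0_schedule = ScheduleDefinition(
- job=harvest_cuahsihismaaeriids0_job, cron_schedule="0 6 * * 1"
-)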
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihismazarriverprojectids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihismazarriverprojectids0.py
deleted file mode 100644
index 1bda0d7a..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihismazarriverprojectids0.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils.util
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, and not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- with open(image_path, 'rb') as f:
- data = f.read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
- # a future revision of EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
- ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
- # docker.py if replicated job, total completions = replicas
- # replicas =0 you do not get a container
- service_mode = ServiceMode("replicated-job",concurrency=1,replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
- mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise f"Container for service {name} not starting"
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
- cid = container.id # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
- ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
- # Future: need to extract the files, and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues; it returns nothing.
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def cuahsihismazarriverprojectids0_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def cuahsihismazarriverprojectids0_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "cuahsihismazarriverprojectids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihismazarriverprojectids0_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "cuahsihismazarriverprojectids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihismazarriverprojectids0_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "cuahsihismazarriverprojectids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihismazarriverprojectids0_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "cuahsihismazarriverprojectids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihismazarriverprojectids0_naburelease(context):
- returned_value = gleanerio(context,("release"), "cuahsihismazarriverprojectids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihismazarriverprojectids0_uploadrelease(context):
- returned_value = postRelease("cuahsihismazarriverprojectids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def cuahsihismazarriverprojectids0_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihismazarriverprojectids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihismazarriverprojectids0"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-    r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihismazarriverprojectids0_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihismazarriverprojectids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihismazarriverprojectids0"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihismazarriverprojectids0_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihismazarriverprojectids0")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihismazarriverprojectids0"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihismazarriverprojectids0_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihismazarriverprojectids0")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihismazarriverprojectids0"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihismazarriverprojectids0_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihismazarriverprojectids0"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
-# Can we simplify and just use a method, then import these methods?
-# def missingreport_s3(context, msg: str, source="cuahsihismazarriverprojectids0"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="cuahsihismazarriverprojectids0"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_cuahsihismazarriverprojectids0():
- containers = cuahsihismazarriverprojectids0_getImage()
- harvest = cuahsihismazarriverprojectids0_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
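-    # each op takes start=<upstream output> as a Nothing input, so it runs strictly after its upstream op without receiving data from it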
-
- report_ms3 = cuahsihismazarriverprojectids0_missingreport_s3(start=harvest)
- report_idstat = cuahsihismazarriverprojectids0_identifier_stats(start=report_ms3)
-    # for some reason, this causes a missing msg parameter error
- report_bucketurl = cuahsihismazarriverprojectids0_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="cuahsihismazarriverprojectids0")
- load_release = cuahsihismazarriverprojectids0_naburelease(start=harvest)
- load_uploadrelease = cuahsihismazarriverprojectids0_uploadrelease(start=load_release)
-
- load_prune = cuahsihismazarriverprojectids0_nabu_prune(start=load_uploadrelease)
- load_prov = cuahsihismazarriverprojectids0_nabuprov(start=load_prune)
- load_org = cuahsihismazarriverprojectids0_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=cuahsihismazarriverprojectids0_missingreport_graph(start=load_org)
- report_graph=cuahsihismazarriverprojectids0_graph_reports(start=report_msgraph)
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihismmaatacamaids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihismmaatacamaids0.py
deleted file mode 100644
index 8f2dd38c..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihismmaatacamaids0.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
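-    # drop the port when it is the scheme default (80 for plain http, 443 for https); otherwise the client needs it spelled out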
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
-    # Make 'X' bucket if it doesn't exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
-    # a revision of EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
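-    # docker-py reads extra HTTP headers from the config's HttpHeaders entry, so stash the Portainer API key there for every request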
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
-    ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
-    # docker.py: for a replicated job, total completions = replicas
-    # with replicas=0 you do not get a container
-    service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
-        mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
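-    # poll once a second, up to ~12s, for the swarm job's container to appear before giving up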
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise f"Container for service {name} not starting"
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
-        cid = container.id  # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
-        ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
-        # we pull the logs first, then raise on a non-zero exit
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
-    # Future: need to extract files and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def cuahsihismmaatacamaids0_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def cuahsihismmaatacamaids0_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "cuahsihismmaatacamaids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihismmaatacamaids0_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "cuahsihismmaatacamaids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihismmaatacamaids0_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "cuahsihismmaatacamaids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihismmaatacamaids0_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "cuahsihismmaatacamaids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihismmaatacamaids0_naburelease(context):
- returned_value = gleanerio(context,("release"), "cuahsihismmaatacamaids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihismmaatacamaids0_uploadrelease(context):
- returned_value = postRelease("cuahsihismmaatacamaids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def cuahsihismmaatacamaids0_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihismmaatacamaids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihismmaatacamaids0"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-    r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihismmaatacamaids0_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihismmaatacamaids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihismmaatacamaids0"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihismmaatacamaids0_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihismmaatacamaids0")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihismmaatacamaids0"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihismmaatacamaids0_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihismmaatacamaids0")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihismmaatacamaids0"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihismmaatacamaids0_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihismmaatacamaids0"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
-# Can we simplify and just use a method, then import these methods?
-# def missingreport_s3(context, msg: str, source="cuahsihismmaatacamaids0"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="cuahsihismmaatacamaids0"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_cuahsihismmaatacamaids0():
- containers = cuahsihismmaatacamaids0_getImage()
- harvest = cuahsihismmaatacamaids0_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = cuahsihismmaatacamaids0_missingreport_s3(start=harvest)
- report_idstat = cuahsihismmaatacamaids0_identifier_stats(start=report_ms3)
-    # for some reason, this causes a missing msg parameter error
- report_bucketurl = cuahsihismmaatacamaids0_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="cuahsihismmaatacamaids0")
- load_release = cuahsihismmaatacamaids0_naburelease(start=harvest)
- load_uploadrelease = cuahsihismmaatacamaids0_uploadrelease(start=load_release)
-
- load_prune = cuahsihismmaatacamaids0_nabu_prune(start=load_uploadrelease)
- load_prov = cuahsihismmaatacamaids0_nabuprov(start=load_prune)
- load_org = cuahsihismmaatacamaids0_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=cuahsihismmaatacamaids0_missingreport_graph(start=load_org)
- report_graph=cuahsihismmaatacamaids0_graph_reports(start=report_msgraph)
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihismobilecrowdhydrologyids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihismobilecrowdhydrologyids0.py
deleted file mode 100644
index bf374bb7..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihismobilecrowdhydrologyids0.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
-    # Make 'X' bucket if it doesn't exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
-    # a revision of EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
-    ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
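-    # condition='none' keeps swarm from restarting the one-shot job container after it exits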
-    # docker.py: for a replicated job, total completions = replicas
-    # with replicas=0 you do not get a container
-    service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
-        mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise f"Container for service {name} not starting"
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
-        cid = container.id  # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
-        ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
-        # we pull the logs first, then raise on a non-zero exit
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
-    # Future: need to extract files and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def cuahsihismobilecrowdhydrologyids0_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def cuahsihismobilecrowdhydrologyids0_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "cuahsihismobilecrowdhydrologyids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihismobilecrowdhydrologyids0_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "cuahsihismobilecrowdhydrologyids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihismobilecrowdhydrologyids0_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "cuahsihismobilecrowdhydrologyids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihismobilecrowdhydrologyids0_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "cuahsihismobilecrowdhydrologyids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihismobilecrowdhydrologyids0_naburelease(context):
- returned_value = gleanerio(context,("release"), "cuahsihismobilecrowdhydrologyids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihismobilecrowdhydrologyids0_uploadrelease(context):
- returned_value = postRelease("cuahsihismobilecrowdhydrologyids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def cuahsihismobilecrowdhydrologyids0_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihismobilecrowdhydrologyids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihismobilecrowdhydrologyids0"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihismobilecrowdhydrologyids0_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihismobilecrowdhydrologyids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihismobilecrowdhydrologyids0"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihismobilecrowdhydrologyids0_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihismobilecrowdhydrologyids0")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihismobilecrowdhydrologyids0"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihismobilecrowdhydrologyids0_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihismobilecrowdhydrologyids0")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihismobilecrowdhydrologyids0"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihismobilecrowdhydrologyids0_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihismobilecrowdhydrologyids0"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
-#Can we simplify and use just a method. Then import these methods?
-# def missingreport_s3(context, msg: str, source="cuahsihismobilecrowdhydrologyids0"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="cuahsihismobilecrowdhydrologyids0"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_cuahsihismobilecrowdhydrologyids0():
- containers = cuahsihismobilecrowdhydrologyids0_getImage()
- harvest = cuahsihismobilecrowdhydrologyids0_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = cuahsihismobilecrowdhydrologyids0_missingreport_s3(start=harvest)
- report_idstat = cuahsihismobilecrowdhydrologyids0_identifier_stats(start=report_ms3)
- # for some reason, this causes a msg parameter missing
- report_bucketurl = cuahsihismobilecrowdhydrologyids0_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="cuahsihismobilecrowdhydrologyids0")
- load_release = cuahsihismobilecrowdhydrologyids0_naburelease(start=harvest)
- load_uploadrelease = cuahsihismobilecrowdhydrologyids0_uploadrelease(start=load_release)
-
- load_prune = cuahsihismobilecrowdhydrologyids0_nabu_prune(start=load_uploadrelease)
- load_prov = cuahsihismobilecrowdhydrologyids0_nabuprov(start=load_prune)
- load_org = cuahsihismobilecrowdhydrologyids0_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=cuahsihismobilecrowdhydrologyids0_missingreport_graph(start=load_org)
- report_graph=cuahsihismobilecrowdhydrologyids0_graph_reports(start=report_msgraph)
-
-
-
-
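The deleted modules above differ only in the hard-coded source name, and the commented-out question each one carries ("Can we simplify and use just a method. Then import these methods?") points at the obvious refactor. Below is a minimal sketch of that factory idea, not the project's actual code: `gleanerio` stands in for the helper each module defines (stubbed here so the sketch runs), and `make_mode_op`/`make_harvest_graph` are hypothetical names.

```python
from dagster import In, Nothing, get_dagster_logger, graph, op

def gleanerio(context, mode: str, source: str) -> int:
    """Stand-in for the gleanerio() defined in the deleted modules."""
    get_dagster_logger().info(f"would run {mode} for {source}")
    return 0

def make_mode_op(source: str, mode: str, needs_start: bool = True):
    """Build one op that runs a single gleaner/nabu mode for one source."""
    ins = {"start": In(Nothing)} if needs_start else {}

    @op(name=f"{source}_{mode}", ins=ins)
    def _mode_op(context):
        rc = gleanerio(context, mode, source)
        get_dagster_logger().info(f"{mode} returned {rc}")

    return _mode_op

def make_harvest_graph(source: str):
    """Chain the modes with the same Nothing dependencies the generated graphs use."""
    gleaner = make_mode_op(source, "gleaner", needs_start=False)
    release = make_mode_op(source, "release")
    prune = make_mode_op(source, "prune")
    prov = make_mode_op(source, "prov")
    orgs = make_mode_op(source, "orgs")

    @graph(name=f"harvest_{source}")
    def _harvest():
        orgs(start=prov(start=prune(start=release(start=gleaner()))))

    return _harvest

# one call per source instead of one ~700-line generated module
harvest_cuahsihismopexids0 = make_harvest_graph("cuahsihismopexids0")
```

The chained `start=` arguments preserve the Nothing-dependency ordering the generated graphs rely on, so a single `make_harvest_graph(source)` call could stand in for a whole generated file.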
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihismopexids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihismopexids0.py
deleted file mode 100644
index 960b60d8..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihismopexids0.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, and not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
- # revision of EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
- ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
- # docker.py: for a replicated job, total completions = replicas
- # with replicas=0 you do not get a container
- service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
- mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise f"Container for service {name} not starting"
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
- cid = container.id # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
- ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
- # Future: need to extract files and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def cuahsihismopexids0_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def cuahsihismopexids0_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "cuahsihismopexids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihismopexids0_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "cuahsihismopexids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihismopexids0_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "cuahsihismopexids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihismopexids0_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "cuahsihismopexids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihismopexids0_naburelease(context):
- returned_value = gleanerio(context,("release"), "cuahsihismopexids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihismopexids0_uploadrelease(context):
- returned_value = postRelease("cuahsihismopexids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def cuahsihismopexids0_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihismopexids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihismopexids0"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihismopexids0_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihismopexids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihismopexids0"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihismopexids0_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihismopexids0")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihismopexids0"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihismopexids0_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihismopexids0")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihismopexids0"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihismopexids0_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihismopexids0"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
-#Can we simplify and use just a method. Then import these methods?
-# def missingreport_s3(context, msg: str, source="cuahsihismopexids0"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="cuahsihismopexids0"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_cuahsihismopexids0():
- containers = cuahsihismopexids0_getImage()
- harvest = cuahsihismopexids0_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = cuahsihismopexids0_missingreport_s3(start=harvest)
- report_idstat = cuahsihismopexids0_identifier_stats(start=report_ms3)
- # for some reason, this causes a msg parameter missing
- report_bucketurl = cuahsihismopexids0_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="cuahsihismopexids0")
- load_release = cuahsihismopexids0_naburelease(start=harvest)
- load_uploadrelease = cuahsihismopexids0_uploadrelease(start=load_release)
-
- load_prune = cuahsihismopexids0_nabu_prune(start=load_uploadrelease)
- load_prov = cuahsihismopexids0_nabuprov(start=load_prune)
- load_org = cuahsihismopexids0_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=cuahsihismopexids0_missingreport_graph(start=load_org)
- report_graph=cuahsihismopexids0_graph_reports(start=report_msgraph)
-
-
-
-
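The other piece every generated module duplicates is `_create_service`, which launches gleaner/nabu as a one-shot swarm `replicated-job` and polls until the task's container exists. Here is a self-contained sketch of that pattern against the plain docker SDK, with the string `raise` replaced by a proper exception; the `run_one_shot` name and the ~12-second budget are illustrative, mirroring the deleted loop rather than prescribing it.

```python
import time

import docker
from docker.types import RestartPolicy, ServiceMode

def run_one_shot(image: str, command: list[str], name: str):
    """Run a command as a one-shot swarm job and return (service, container)."""
    client = docker.from_env()
    service = client.services.create(
        image,
        args=command,
        name=name,
        restart_policy=RestartPolicy(condition="none"),  # never retry the job
        mode=ServiceMode("replicated-job", concurrency=1, replicas=1),
    )
    # the task's container is created asynchronously; poll for ~12s
    for _ in range(12):
        time.sleep(1)
        containers = client.containers.list(
            all=True,
            filters={"label": f"com.docker.swarm.service.name={name}"},
        )
        if containers:
            return service, containers[0]
    service.remove()
    raise RuntimeError(f"Container for service {name} not starting")
```

Removing the service on timeout avoids leaving an orphaned job behind, which the `finally` blocks in the deleted modules otherwise have to clean up.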
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihismuddyriverids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihismuddyriverids0.py
deleted file mode 100644
index 47e28927..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihismuddyriverids0.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, and not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
- # revision of EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
- ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
- # docker.py: for a replicated job, total completions = replicas
- # with replicas=0 you do not get a container
- service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
- mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise f"Container for service {name} not starting"
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
- cid = container.id # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
- ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
-        s3loader(str(c).encode(), NAME)  # s3loader needs a bytes-like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
-        # Future: need to extract files and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
-        # looks like get_archive also has issues. Returns nothing.
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def cuahsihismuddyriverids0_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def cuahsihismuddyriverids0_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "cuahsihismuddyriverids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihismuddyriverids0_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "cuahsihismuddyriverids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihismuddyriverids0_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "cuahsihismuddyriverids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihismuddyriverids0_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "cuahsihismuddyriverids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihismuddyriverids0_naburelease(context):
- returned_value = gleanerio(context,("release"), "cuahsihismuddyriverids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihismuddyriverids0_uploadrelease(context):
- returned_value = postRelease("cuahsihismuddyriverids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def cuahsihismuddyriverids0_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihismuddyriverids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihismuddyriverids0"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-    r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihismuddyriverids0_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihismuddyriverids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihismuddyriverids0"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihismuddyriverids0_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihismuddyriverids0")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihismuddyriverids0"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihismuddyriverids0_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihismuddyriverids0")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihismuddyriverids0"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
-    get_dagster_logger().info(f"identifier stats report returned {r}")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihismuddyriverids0_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihismuddyriverids0"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
-    get_dagster_logger().info(f"bucket urls report returned {r}")
- return
-
-
-# Can we simplify and use just a method, then import these methods?
-# def missingreport_s3(context, msg: str, source="cuahsihismuddyriverids0"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="cuahsihismuddyriverids0"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_cuahsihismuddyriverids0():
- containers = cuahsihismuddyriverids0_getImage()
- harvest = cuahsihismuddyriverids0_gleaner(start=containers)
-
-    # defining Nothing dependencies
-    # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = cuahsihismuddyriverids0_missingreport_s3(start=harvest)
- report_idstat = cuahsihismuddyriverids0_identifier_stats(start=report_ms3)
-    # for some reason, this causes a "msg parameter missing" error
- report_bucketurl = cuahsihismuddyriverids0_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="cuahsihismuddyriverids0")
- load_release = cuahsihismuddyriverids0_naburelease(start=harvest)
- load_uploadrelease = cuahsihismuddyriverids0_uploadrelease(start=load_release)
-
- load_prune = cuahsihismuddyriverids0_nabu_prune(start=load_uploadrelease)
- load_prov = cuahsihismuddyriverids0_nabuprov(start=load_prune)
- load_org = cuahsihismuddyriverids0_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=cuahsihismuddyriverids0_missingreport_graph(start=load_org)
- report_graph=cuahsihismuddyriverids0_graph_reports(start=report_msgraph)
-
-
-
-
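The generated `harvest_*` graph above wires every op together through `start=In(Nothing)` inputs, i.e. pure ordering dependencies with no data flowing between ops. A minimal, self-contained sketch of that pattern (hypothetical op names, not part of the generated code):

```python
# Ordering-only dependencies: each op declares a Nothing input named "start",
# so a downstream op waits for upstream completion without receiving a value.
from dagster import In, Nothing, get_dagster_logger, job, op

@op
def pull_images():
    get_dagster_logger().info("images pulled")

@op(ins={"start": In(Nothing)})
def harvest():
    get_dagster_logger().info("harvest runs only after pull_images")

@op(ins={"start": In(Nothing)})
def report():
    get_dagster_logger().info("report runs only after harvest")

@job
def demo_harvest():
    report(start=harvest(start=pull_images()))
```

`demo_harvest.execute_in_process()` runs the three ops strictly in sequence, which is exactly how the generated graphs sequence gleaner, the nabu steps, and the reports.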
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihismudlakeids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihismudlakeids0.py
deleted file mode 100644
index 261e7f5d..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihismudlakeids0.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils.util
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, and not the names of the object in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
-    with open(image_path, 'rb') as f:
-        return f.read()
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
-    # revision of EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
-    ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
-    # docker.py: for a replicated job, total completions = replicas
-    # replicas=0 means you do not get a container
-    service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
-        mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
-            raise Exception(f"Container for service {name} not starting")
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
-
-
-
-
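`_create_service` above runs a one-shot container by creating a swarm service in `replicated-job` mode with restarts disabled, then polls for the task's container via the `com.docker.swarm.service.name` label. A hedged sketch of that trick in isolation (assumes a swarm-enabled Docker daemon; the image and service name are placeholders):

```python
# One-shot swarm job: replicated-job mode runs the task to completion once;
# RestartPolicy(condition="none") keeps swarm from respawning it.
import time

import docker
from docker.types import RestartPolicy, ServiceMode

client = docker.from_env()
service = client.services.create(
    "alpine:3",
    args=["echo", "hello"],
    name="demo_one_shot",
    restart_policy=RestartPolicy(condition="none"),
    mode=ServiceMode("replicated-job", replicas=1, concurrency=1),
)
# Same polling idea as the while loop above: swarm creates the task container
# asynchronously, so wait (bounded) until it shows up by service-name label.
for _ in range(12):
    containers = client.containers.list(
        all=True, filters={"label": "com.docker.swarm.service.name=demo_one_shot"}
    )
    if containers:
        break
    time.sleep(1)
else:
    raise RuntimeError("container for service demo_one_shot did not start")
```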
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
-        # add env variables here, e.g. "Env": ["FOO=bar","BAZ=quux"]
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
-        cid = container.id  # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
-        ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
-        s3loader(str(c).encode(), NAME)  # s3loader needs a bytes-like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
-        # Future: need to extract files and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
-        # looks like get_archive also has issues. Returns nothing.
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
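The log-shipping block above pulls `{WorkingDir}/logs` out of the finished container with a raw `GET /containers/{id}/archive` call (the Docker Engine API returns a tar stream of the requested path), authenticated against Portainer with an `X-API-Key` header. A sketch of that call on its own, with the module-level `URL`/`APIKEY`/container id passed in as parameters:

```python
# Fetch a directory from a container as tar bytes via the Engine API.
# base_url/api_key/cid are placeholders for the module's URL, APIKEY and cid.
import io
import tarfile
import urllib.parse
from urllib import request

def fetch_container_dir(base_url: str, api_key: str, cid: str, path: str) -> bytes:
    qs = urllib.parse.urlencode({"path": path})
    req = request.Request(f"{base_url}/containers/{cid}/archive?{qs}", method="GET")
    req.add_header("X-API-Key", api_key)
    with request.urlopen(req) as resp:
        return resp.read()  # raw tar bytes, ready for s3loader or extraction

# The "Future: need to extract files" TODO would then be roughly:
# tar_bytes = fetch_container_dir(URL, APIKEY, cid, "/nabu/logs")
# with tarfile.open(fileobj=io.BytesIO(tar_bytes)) as tar:
#     tar.extractall("extract_to/")  # only for archives you trust
```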
-@op
-def cuahsihismudlakeids0_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def cuahsihismudlakeids0_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "cuahsihismudlakeids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihismudlakeids0_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "cuahsihismudlakeids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihismudlakeids0_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "cuahsihismudlakeids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihismudlakeids0_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "cuahsihismudlakeids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihismudlakeids0_naburelease(context):
- returned_value = gleanerio(context,("release"), "cuahsihismudlakeids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihismudlakeids0_uploadrelease(context):
- returned_value = postRelease("cuahsihismudlakeids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def cuahsihismudlakeids0_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihismudlakeids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihismudlakeids0"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-    r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihismudlakeids0_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihismudlakeids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihismudlakeids0"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihismudlakeids0_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihismudlakeids0")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihismudlakeids0"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihismudlakeids0_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihismudlakeids0")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihismudlakeids0"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
-    get_dagster_logger().info(f"identifier stats report returned {r}")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihismudlakeids0_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihismudlakeids0"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
-    get_dagster_logger().info(f"bucket urls report returned {r}")
- return
-
-
-# Can we simplify and use just a method, then import these methods?
-# def missingreport_s3(context, msg: str, source="cuahsihismudlakeids0"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="cuahsihismudlakeids0"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_cuahsihismudlakeids0():
- containers = cuahsihismudlakeids0_getImage()
- harvest = cuahsihismudlakeids0_gleaner(start=containers)
-
-    # defining Nothing dependencies
-    # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = cuahsihismudlakeids0_missingreport_s3(start=harvest)
- report_idstat = cuahsihismudlakeids0_identifier_stats(start=report_ms3)
-    # for some reason, this causes a "msg parameter missing" error
- report_bucketurl = cuahsihismudlakeids0_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="cuahsihismudlakeids0")
- load_release = cuahsihismudlakeids0_naburelease(start=harvest)
- load_uploadrelease = cuahsihismudlakeids0_uploadrelease(start=load_release)
-
- load_prune = cuahsihismudlakeids0_nabu_prune(start=load_uploadrelease)
- load_prov = cuahsihismudlakeids0_nabuprov(start=load_prune)
- load_org = cuahsihismudlakeids0_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=cuahsihismudlakeids0_missingreport_graph(start=load_org)
- report_graph=cuahsihismudlakeids0_graph_reports(start=report_msgraph)
-
-
-
-
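One portability note that applies to this module and the next: the config parsing relies on `distutils.util.strtobool`, and `distutils` is deprecated by PEP 632 and removed in Python 3.12. A drop-in replacement is small enough to inline (a sketch, not something the repo currently ships):

```python
# Same truthy/falsy vocabulary as distutils.util.strtobool, returning a bool.
def strtobool(value: str) -> bool:
    v = value.strip().lower()
    if v in ("y", "yes", "t", "true", "on", "1"):
        return True
    if v in ("n", "no", "f", "false", "off", "0"):
        return False
    raise ValueError(f"invalid truth value {value!r}")

# e.g. GLEANER_MINIO_USE_SSL = strtobool(os.environ.get("GLEANERIO_MINIO_USE_SSL", "false"))
```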
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihismwdisids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihismwdisids0.py
deleted file mode 100644
index ddf22ca5..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihismwdisids0.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils.util
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, and not the names of the object in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
-    with open(image_path, 'rb') as f:
-        return f.read()
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
-    # revision of EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
-    ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
-    # docker.py: for a replicated job, total completions = replicas
-    # replicas=0 means you do not get a container
-    service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
-        mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
-            raise Exception(f"Container for service {name} not starting")
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
-        # add env variables here, e.g. "Env": ["FOO=bar","BAZ=quux"]
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
- cid = container.id # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
- ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
- # Future: need to extract files and upload them
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def cuahsihismwdisids0_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def cuahsihismwdisids0_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "cuahsihismwdisids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihismwdisids0_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "cuahsihismwdisids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihismwdisids0_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "cuahsihismwdisids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihismwdisids0_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "cuahsihismwdisids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihismwdisids0_naburelease(context):
- returned_value = gleanerio(context,("release"), "cuahsihismwdisids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihismwdisids0_uploadrelease(context):
- returned_value = postRelease("cuahsihismwdisids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def cuahsihismwdisids0_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihismwdisids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihismwdisids0"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihismwdisids0_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihismwdisids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihismwdisids0"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihismwdisids0_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihismwdisids0")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihismwdisids0"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihismwdisids0_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihismwdisids0")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihismwdisids0"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihismwdisids0_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihismwdisids0"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
-# Can we simplify this to a single parameterized method and import it? (see the sketch below)
-# def missingreport_s3(context, msg: str, source="cuahsihismwdisids0"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="cuahsihismwdisids0"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
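-# A hedged sketch of that simplification (illustrative only, not part of the
-# generated output), reusing only names already defined in this module:
-# def missingreport_s3_for(source_name):
-#     source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source_name)
-#     s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-#     return missingReport(source.get('url'), GLEANER_MINIO_BUCKET, source_name,
-#                          s3Minio, None, milled=False, summon=True)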
-@graph
-def harvest_cuahsihismwdisids0():
- containers = cuahsihismwdisids0_getImage()
- harvest = cuahsihismwdisids0_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = cuahsihismwdisids0_missingreport_s3(start=harvest)
- report_idstat = cuahsihismwdisids0_identifier_stats(start=report_ms3)
- # for some reason, this caused a "missing msg parameter" error
- report_bucketurl = cuahsihismwdisids0_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="cuahsihismwdisids0")
- load_release = cuahsihismwdisids0_naburelease(start=harvest)
- load_uploadrelease = cuahsihismwdisids0_uploadrelease(start=load_release)
-
- load_prune = cuahsihismwdisids0_nabu_prune(start=load_uploadrelease)
- load_prov = cuahsihismwdisids0_nabuprov(start=load_prune)
- load_org = cuahsihismwdisids0_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=cuahsihismwdisids0_missingreport_graph(start=load_org)
- report_graph=cuahsihismwdisids0_graph_reports(start=report_msgraph)
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihismwraids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihismwraids0.py
deleted file mode 100644
index fe282d56..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihismwraids0.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, and not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
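-# Illustration with hypothetical values: GLEANERIO_GRAPH_URL=http://graph:9999/blazegraph
-# and GLEANERIO_GRAPH_NAMESPACE=iow yield http://graph:9999/blazegraph/namespace/iow/sparql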
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if it does not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
- # a revision of EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
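- # Both the general HttpHeaders config and the live session headers carry the
- # Portainer API key, so every Docker API call authenticates through the
- # Portainer endpoint given by PORTAINER_URL.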
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
- ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
- # docker-py: for a replicated-job, total completions = replicas;
- # with replicas=0 you do not get a container
- service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
- mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise f"Container for service {name} not starting"
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
-
-
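-# Note on the polling above: a swarm replicated-job creates its container
-# asynchronously, so _create_service polls (up to ~12s) for a container carrying
-# the com.docker.swarm.service.name label. A hedged alternative sketch, assuming
-# docker-py task dicts, would poll task state instead:
-#   tasks = service.tasks()
-#   started = bool(tasks) and tasks[0]["Status"]["State"] not in ("new", "pending")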
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
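- # A hedged refactor sketch (illustrative only): the nabu branches above differ
- # only in subcommand and prefix, so they could be table driven:
- #   NABU_MODES = {"prune": ("prune", "summoned/" + source),
- #                 "prov": ("prefix", "prov/" + source),
- #                 "orgs": ("prefix", "orgs"),
- #                 "release": ("release", "summoned/" + source)}
- #   verb, prefix = NABU_MODES[str(mode)]
- #   ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, verb, "--prefix", prefix]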
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
- cid = container.id # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
- ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
- # Future: need to extract files and upload them
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def cuahsihismwraids0_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def cuahsihismwraids0_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "cuahsihismwraids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihismwraids0_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "cuahsihismwraids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihismwraids0_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "cuahsihismwraids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihismwraids0_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "cuahsihismwraids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihismwraids0_naburelease(context):
- returned_value = gleanerio(context,("release"), "cuahsihismwraids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihismwraids0_uploadrelease(context):
- returned_value = postRelease("cuahsihismwraids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def cuahsihismwraids0_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihismwraids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihismwraids0"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihismwraids0_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihismwraids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihismwraids0"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihismwraids0_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihismwraids0")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihismwraids0"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihismwraids0_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihismwraids0")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihismwraids0"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihismwraids0_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihismwraids0"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
-# Can we simplify this to a single parameterized method and import it?
-# def missingreport_s3(context, msg: str, source="cuahsihismwraids0"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="cuahsihismwraids0"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_cuahsihismwraids0():
- containers = cuahsihismwraids0_getImage()
- harvest = cuahsihismwraids0_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = cuahsihismwraids0_missingreport_s3(start=harvest)
- report_idstat = cuahsihismwraids0_identifier_stats(start=report_ms3)
- # for some reason, this caused a "missing msg parameter" error
- report_bucketurl = cuahsihismwraids0_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="cuahsihismwraids0")
- load_release = cuahsihismwraids0_naburelease(start=harvest)
- load_uploadrelease = cuahsihismwraids0_uploadrelease(start=load_release)
-
- load_prune = cuahsihismwraids0_nabu_prune(start=load_uploadrelease)
- load_prov = cuahsihismwraids0_nabuprov(start=load_prune)
- load_org = cuahsihismwraids0_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=cuahsihismwraids0_missingreport_graph(start=load_org)
- report_graph=cuahsihismwraids0_graph_reports(start=report_msgraph)
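-# A hedged sketch (not part of this generated file) of exposing the graph as a
-# runnable Dagster job via the standard graph-to-job API:
-#   harvest_cuahsihismwraids0_job = harvest_cuahsihismwraids0.to_job()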
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisnashrwaids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisnashrwaids0.py
deleted file mode 100644
index f2b9f904..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisnashrwaids0.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, and not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if it does not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
- # a revision of EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
- ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
- # docker-py: for a replicated-job, total completions = replicas;
- # with replicas=0 you do not get a container
- service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
- mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise f"Container for service {name} not starting"
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add env variables here, e.g. "Env": ["FOO=bar","BAZ=quux"]
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
- cid = container.id # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
- ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
- # Future: need to extract files and upload (sketched below)
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
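- # Sketch of the extraction flagged as "Future" above, assuming the archive
- # endpoint returned an uncompressed tar and the body had been buffered once
- # (the response was already consumed by s3loader here, so `buf` is illustrative):
- #
- # import tarfile
- # archive = tarfile.open(fileobj=io.BytesIO(buf))
- # for member in archive.getmembers():
- #     if member.isfile():
- #         body = archive.extractfile(member).read()
- #         s3loader(body, f"{source}_{mode}_{member.name.replace('/', '_')}")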
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def cuahsihisnashrwaids0_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def cuahsihisnashrwaids0_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "cuahsihisnashrwaids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisnashrwaids0_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "cuahsihisnashrwaids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisnashrwaids0_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "cuahsihisnashrwaids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisnashrwaids0_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "cuahsihisnashrwaids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisnashrwaids0_naburelease(context):
- returned_value = gleanerio(context,("release"), "cuahsihisnashrwaids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisnashrwaids0_uploadrelease(context):
- returned_value = postRelease("cuahsihisnashrwaids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisnashrwaids0_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisnashrwaids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisnashrwaids0"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisnashrwaids0_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisnashrwaids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisnashrwaids0"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisnashrwaids0_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisnashrwaids0")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisnashrwaids0"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisnashrwaids0_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisnashrwaids0")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisnashrwaids0"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisnashrwaids0_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisnashrwaids0"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
-#Can we simplify and use just a method. Then import these methods?
-# def missingreport_s3(context, msg: str, source="cuahsihisnashrwaids0"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="cuahsihisnashrwaids0"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
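-# A sketch of the simplification asked about above: a factory could stamp out
-# the per-source ops, so each generated module shrinks to a handful of calls
-# (make_gleaner_op is a hypothetical helper, not current code):
-#
-# def make_gleaner_op(source: str, mode: str):
-#     @op(name=f"{source}_{mode}", ins={"start": In(Nothing)})
-#     def _op(context):
-#         returned_value = gleanerio(context, mode, source)
-#         get_dagster_logger().info(f"{mode} returned {returned_value} ")
-#     return _op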
-@graph
-def harvest_cuahsihisnashrwaids0():
- containers = cuahsihisnashrwaids0_getImage()
- harvest = cuahsihisnashrwaids0_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = cuahsihisnashrwaids0_missingreport_s3(start=harvest)
- report_idstat = cuahsihisnashrwaids0_identifier_stats(start=report_ms3)
- # for some reason, this causes a msg parameter missing
- report_bucketurl = cuahsihisnashrwaids0_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="cuahsihisnashrwaids0")
- load_release = cuahsihisnashrwaids0_naburelease(start=harvest)
- load_uploadrelease = cuahsihisnashrwaids0_uploadrelease(start=load_release)
-
- load_prune = cuahsihisnashrwaids0_nabu_prune(start=load_uploadrelease)
- load_prov = cuahsihisnashrwaids0_nabuprov(start=load_prune)
- load_org = cuahsihisnashrwaids0_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=cuahsihisnashrwaids0_missingreport_graph(start=load_org)
- report_graph=cuahsihisnashrwaids0_graph_reports(start=report_msgraph)
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisnceiww2ids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisnceiww2ids0.py
deleted file mode 100644
index b550f29e..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisnceiww2ids0.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils.util
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, and not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
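-# For illustration (hostnames are hypothetical): AWS endpoints collapse to the
-# bare service host, anything else passes through unchanged:
-#   _pythonMinioUrl("s3.us-east-1.amazonaws.com") -> "s3.amazonaws.com"
-#   _pythonMinioUrl("minio.example.org")          -> "minio.example.org"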
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
- # revision of EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
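-# Worked example (hostnames and source are hypothetical): with MinIO at
-# oss.example.org:443 over SSL and bucket "gleaner", postRelease("example")
-# POSTs to
-#   <graph endpoint>?uri=https://oss.example.org:443/gleaner/graphs/latest/example_release.nq
-# i.e. the graph store is asked to load the release file directly from S3.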
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
- ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
- # docker.py: for a replicated job, total completions = replicas;
- # with replicas=0 you never get a container
- service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
- mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise f"Container for service {name} not starting"
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
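-# The polling above gives the swarm scheduler roughly 12 seconds to place the
-# job's task; the same wait extracted as a reusable helper would look roughly
-# like this (_wait_for_service_container is a hypothetical name, not part of
-# this module):
-#
-# def _wait_for_service_container(client, name, attempts=12, delay=1):
-#     for _ in range(attempts):
-#         time.sleep(delay)
-#         found = client.containers.list(
-#             all=True, filters={"label": f"com.docker.swarm.service.name={name}"})
-#         if found:
-#             return found[0]
-#     raise Exception(f"Container for service {name} not starting")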
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
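- # Worked example: for a hypothetical source "example", mode "prune" with the
- # default config path resolves to
- #   IMAGE = GLEANERIO_NABU_IMAGE
- #   ARGS  = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/example"]
- #   NAME  = "sch_example_prune"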
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add env variables here, e.g. "Env": ["FOO=bar","BAZ=quux"]
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
- cid = container.id # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
- ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
- # Future: need to extract files and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def cuahsihisnceiww2ids0_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def cuahsihisnceiww2ids0_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "cuahsihisnceiww2ids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisnceiww2ids0_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "cuahsihisnceiww2ids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisnceiww2ids0_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "cuahsihisnceiww2ids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisnceiww2ids0_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "cuahsihisnceiww2ids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisnceiww2ids0_naburelease(context):
- returned_value = gleanerio(context,("release"), "cuahsihisnceiww2ids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisnceiww2ids0_uploadrelease(context):
- returned_value = postRelease("cuahsihisnceiww2ids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisnceiww2ids0_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisnceiww2ids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisnceiww2ids0"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisnceiww2ids0_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisnceiww2ids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisnceiww2ids0"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisnceiww2ids0_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisnceiww2ids0")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisnceiww2ids0"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisnceiww2ids0_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisnceiww2ids0")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisnceiww2ids0"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisnceiww2ids0_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisnceiww2ids0"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
-#Can we simplify and use just a method. Then import these methods?
-# def missingreport_s3(context, msg: str, source="cuahsihisnceiww2ids0"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="cuahsihisnceiww2ids0"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_cuahsihisnceiww2ids0():
- containers = cuahsihisnceiww2ids0_getImage()
- harvest = cuahsihisnceiww2ids0_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = cuahsihisnceiww2ids0_missingreport_s3(start=harvest)
- report_idstat = cuahsihisnceiww2ids0_identifier_stats(start=report_ms3)
- # for some reason, this causes a msg parameter missing
- report_bucketurl = cuahsihisnceiww2ids0_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="cuahsihisnceiww2ids0")
- load_release = cuahsihisnceiww2ids0_naburelease(start=harvest)
- load_uploadrelease = cuahsihisnceiww2ids0_uploadrelease(start=load_release)
-
- load_prune = cuahsihisnceiww2ids0_nabu_prune(start=load_uploadrelease)
- load_prov = cuahsihisnceiww2ids0_nabuprov(start=load_prune)
- load_org = cuahsihisnceiww2ids0_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=cuahsihisnceiww2ids0_missingreport_graph(start=load_org)
- report_graph=cuahsihisnceiww2ids0_graph_reports(start=report_msgraph)
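-
-# A graph by itself is not runnable; elsewhere in the generated tree this is
-# presumably wired into a job, roughly (sketch; the job name is hypothetical):
-#
-# harvest_cuahsihisnceiww2ids0_job = harvest_cuahsihisnceiww2ids0.to_job()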
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisneonids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisneonids0.py
deleted file mode 100644
index bf7c8463..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisneonids0.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils.util
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, and not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
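-# e.g. with GLEANERIO_GRAPH_URL=http://graph.example.org:9999/blazegraph and
-# GLEANERIO_GRAPH_NAMESPACE=iow (hypothetical values) this yields
-#   http://graph.example.org:9999/blazegraph/namespace/iow/sparql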
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
- # revision of EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
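-# Usage sketch, mirroring the *_getImage ops in these generated modules: build
-# the run's container context, then pull through the Portainer-backed client:
-#
-# run_ctx = DockerContainerContext.create_for_run(context.dagster_run, None)
-# client = _get_client(run_ctx)
-# client.images.pull(GLEANERIO_GLEANER_IMAGE)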
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
- ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
- # docker.py: for a replicated job, total completions = replicas;
- # with replicas=0 you never get a container
- service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
- mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise f"Container for service {name} not starting"
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
- cid = container.id # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
- ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # we pull the logs first, then throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes-like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes-like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
- # Future: need to extract the files from the tar archive and upload each one
- # (sketch; archive_bytes stands in for the response bytes read above)
- # pw_tar = tarfile.open(fileobj=io.BytesIO(archive_bytes))
- # pw_tar.extractall("extract_to/")
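- # A fuller sketch of that future step (untested assumption: the archive
- # bytes are read once up front, instead of being consumed by the upload
- # call above):
- #   import tarfile
- #   archive = r.read()
- #   with tarfile.open(fileobj=io.BytesIO(archive)) as tar:
- #       for member in tar.getmembers():
- #           extracted = tar.extractfile(member)
- #           if extracted:
- #               s3loader(extracted.read(), f"{source}_{member.name}_runlogs")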
-
- # looks like get_archive also has issues; it returns nothing.
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def cuahsihisneonids0_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def cuahsihisneonids0_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "cuahsihisneonids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisneonids0_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "cuahsihisneonids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisneonids0_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "cuahsihisneonids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisneonids0_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "cuahsihisneonids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisneonids0_naburelease(context):
- returned_value = gleanerio(context,("release"), "cuahsihisneonids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisneonids0_uploadrelease(context):
- returned_value = postRelease("cuahsihisneonids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisneonids0_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisneonids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisneonids0"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisneonids0_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisneonids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisneonids0"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon check off for the graph report
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisneonids0_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisneonids0")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisneonids0"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisneonids0_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisneonids0")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisneonids0"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisneonids0_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisneonids0"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
-#Can we simplify and use just a method. Then import these methods?
-# def missingreport_s3(context, msg: str, source="cuahsihisneonids0"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="cuahsihisneonids0"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_cuahsihisneonids0():
- containers = cuahsihisneonids0_getImage()
- harvest = cuahsihisneonids0_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = cuahsihisneonids0_missingreport_s3(start=harvest)
- report_idstat = cuahsihisneonids0_identifier_stats(start=report_ms3)
- # for some reason, this causes a "msg parameter missing" error
- report_bucketurl = cuahsihisneonids0_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="cuahsihisneonids0")
- load_release = cuahsihisneonids0_naburelease(start=harvest)
- load_uploadrelease = cuahsihisneonids0_uploadrelease(start=load_release)
-
- load_prune = cuahsihisneonids0_nabu_prune(start=load_uploadrelease)
- load_prov = cuahsihisneonids0_nabuprov(start=load_prune)
- load_org = cuahsihisneonids0_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=cuahsihisneonids0_missingreport_graph(start=load_org)
- report_graph=cuahsihisneonids0_graph_reports(start=report_msgraph)
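-
-# A sketch (standard Dagster API; the job name is hypothetical) of wiring
-# this graph into a runnable job:
-#   harvest_cuahsihisneonids0_job = harvest_cuahsihisneonids0.to_job(
-#       name="harvest_cuahsihisneonids0_job")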
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisnevadosids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisnevadosids0.py
deleted file mode 100644
index 80172dc8..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisnevadosids0.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils.util
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, and not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
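-# e.g. with GLEANER_GRAPH_URL="http://graph:9999/blazegraph" and
-# GLEANER_GRAPH_NAMESPACE="iow" (hypothetical values) this yields
-# "http://graph:9999/blazegraph/namespace/iow/sparql"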
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
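-# Example usage: load_data accepts either a URL or a local path (urlopen
-# raises ValueError for a non-URL, which triggers the file fallback);
-# both arguments below are hypothetical:
-#   cfg = load_data("https://example.org/gleanerconfig.yaml")
-#   cfg = load_data("/gleaner/gleanerconfig.yaml")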
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure = GLEANER_MINIO_USE_SSL
- if GLEANER_MINIO_PORT == "80" and not secure:
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif GLEANER_MINIO_PORT == "443" and secure:
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a standard port, so keep the explicit port suffix
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if it does not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
- # revision of EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
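-
-# Example usage (hypothetical source name): postRelease("demo_source") returns
-# True when Blazegraph reports modified data, and raises on a non-200 status
-# or an empty insert.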
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
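-
-# Note on the header juggling above: docker-py exposes no first-class option
-# for custom headers, so the X-API-Key (assumed Portainer authentication) is
-# pushed into the underlying APIClient so every Docker API request carries it:
-#   client = _get_client(container_context)  # any DockerContainerContext
-#   client.ping()  # this request now includes the X-API-Key header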
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
- ## thoughts:
- # return service and container, since there is one
- restart_policy = RestartPolicy(condition='none')
- # docker.py: for a replicated job, total completions = replicas;
- # with replicas=0 you do not get a container
- service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
- mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise f"Container for service {name} not starting"
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
- cid = container.id # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
- ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # we pull the logs first, then throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes-like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes-like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
- # Future: need to extract the files from the tar archive and upload each one
- # (sketch; archive_bytes stands in for the response bytes read above)
- # pw_tar = tarfile.open(fileobj=io.BytesIO(archive_bytes))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues; it returns nothing.
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def cuahsihisnevadosids0_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def cuahsihisnevadosids0_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "cuahsihisnevadosids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisnevadosids0_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "cuahsihisnevadosids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisnevadosids0_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "cuahsihisnevadosids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisnevadosids0_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "cuahsihisnevadosids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisnevadosids0_naburelease(context):
- returned_value = gleanerio(context,("release"), "cuahsihisnevadosids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisnevadosids0_uploadrelease(context):
- returned_value = postRelease("cuahsihisnevadosids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisnevadosids0_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisnevadosids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisnevadosids0"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisnevadosids0_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisnevadosids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisnevadosids0"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon check off for the graph report
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisnevadosids0_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisnevadosids0")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisnevadosids0"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisnevadosids0_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisnevadosids0")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisnevadosids0"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisnevadosids0_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisnevadosids0"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
-#Can we simplify and use just a method. Then import these methods?
-# def missingreport_s3(context, msg: str, source="cuahsihisnevadosids0"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="cuahsihisnevadosids0"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_cuahsihisnevadosids0():
- containers = cuahsihisnevadosids0_getImage()
- harvest = cuahsihisnevadosids0_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = cuahsihisnevadosids0_missingreport_s3(start=harvest)
- report_idstat = cuahsihisnevadosids0_identifier_stats(start=report_ms3)
- # for some reason, this causes a "msg parameter missing" error
- report_bucketurl = cuahsihisnevadosids0_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="cuahsihisnevadosids0")
- load_release = cuahsihisnevadosids0_naburelease(start=harvest)
- load_uploadrelease = cuahsihisnevadosids0_uploadrelease(start=load_release)
-
- load_prune = cuahsihisnevadosids0_nabu_prune(start=load_uploadrelease)
- load_prov = cuahsihisnevadosids0_nabuprov(start=load_prune)
- load_org = cuahsihisnevadosids0_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=cuahsihisnevadosids0_missingreport_graph(start=load_org)
- report_graph=cuahsihisnevadosids0_graph_reports(start=report_msgraph)
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisnevcanids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisnevcanids0.py
deleted file mode 100644
index d7ac8d3f..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisnevcanids0.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils.util
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, and not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure = GLEANER_MINIO_USE_SSL
- if GLEANER_MINIO_PORT == "80" and not secure:
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif GLEANER_MINIO_PORT == "443" and secure:
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a standard port, so keep the explicit port suffix
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if it does not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
- # revision of EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
- ## thoughts:
- # return service and container, since there is one
- restart_policy = RestartPolicy(condition='none')
- # docker.py: for a replicated job, total completions = replicas;
- # with replicas=0 you do not get a container
- service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
- mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise f"Container for service {name} not starting"
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
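-    # Example of the resulting container invocation for mode "prune" with a
-    # hypothetical source "demo" (the image entrypoint supplies the binary,
-    # since data["Entrypoint"] is left unset below):
-    #   nabu --cfg /nabu/nabuconfig.yaml prune --prefix summoned/demo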
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
-        cid = container.id  # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
-        ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
-        # Future: need to extract files, and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def cuahsihisnevcanids0_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def cuahsihisnevcanids0_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "cuahsihisnevcanids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisnevcanids0_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "cuahsihisnevcanids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisnevcanids0_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "cuahsihisnevcanids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisnevcanids0_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "cuahsihisnevcanids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisnevcanids0_naburelease(context):
- returned_value = gleanerio(context,("release"), "cuahsihisnevcanids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisnevcanids0_uploadrelease(context):
- returned_value = postRelease("cuahsihisnevcanids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisnevcanids0_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisnevcanids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisnevcanids0"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-    r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisnevcanids0_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisnevcanids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisnevcanids0"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisnevcanids0_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisnevcanids0")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisnevcanids0"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisnevcanids0_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisnevcanids0")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisnevcanids0"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
-    get_dagster_logger().info(f"identifier stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisnevcanids0_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisnevcanids0"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
-    get_dagster_logger().info(f"bucket urls report returned {r} ")
- return
-
-
-# Can we simplify and use just a method, then import these methods?
-# def missingreport_s3(context, msg: str, source="cuahsihisnevcanids0"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="cuahsihisnevcanids0"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_cuahsihisnevcanids0():
- containers = cuahsihisnevcanids0_getImage()
- harvest = cuahsihisnevcanids0_gleaner(start=containers)
-
-# defining nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
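-    # minimal form of the pattern (a sketch): an op declared with
-    # ins={"start": In(Nothing)} is ordered after its upstream op without
-    # receiving a value, e.g. downstream(start=upstream())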
-
- report_ms3 = cuahsihisnevcanids0_missingreport_s3(start=harvest)
- report_idstat = cuahsihisnevcanids0_identifier_stats(start=report_ms3)
-    # for some reason, this causes a "missing msg parameter" error
- report_bucketurl = cuahsihisnevcanids0_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="cuahsihisnevcanids0")
- load_release = cuahsihisnevcanids0_naburelease(start=harvest)
- load_uploadrelease = cuahsihisnevcanids0_uploadrelease(start=load_release)
-
- load_prune = cuahsihisnevcanids0_nabu_prune(start=load_uploadrelease)
- load_prov = cuahsihisnevcanids0_nabuprov(start=load_prune)
- load_org = cuahsihisnevcanids0_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=cuahsihisnevcanids0_missingreport_graph(start=load_org)
- report_graph=cuahsihisnevcanids0_graph_reports(start=report_msgraph)
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisnewnids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisnewnids0.py
deleted file mode 100644
index a2309479..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisnewnids0.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils.util
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
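-# e.g. DEBUG=true (any casing) enables debug behavior; unset, or any other
-# value, leaves it False.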
-# volume and network need to be the names in docker, and not the names of the object in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
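-# e.g. with hypothetical values GLEANERIO_GRAPH_URL=http://triplestore:9999/blazegraph
-# and GLEANERIO_GRAPH_NAMESPACE=earthcube, this yields
-#   http://triplestore:9999/blazegraph/namespace/earthcube/sparql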
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
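-# sketch of the normalization (hypothetical hosts):
-#   _pythonMinioUrl("play.min.io")                -> "play.min.io"
-#   _pythonMinioUrl("mybucket.s3.amazonaws.com")  -> "s3.amazonaws.com"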
-def read_file_bytestream(image_path):
-    with open(image_path, 'rb') as f:
-        return f.read()
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
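-# load_data accepts either form: a URL is fetched with urlopen, while a bare
-# filesystem path makes urlopen raise ValueError ("unknown url type"), which
-# falls through to the local file read, e.g. (hypothetical paths)
-#   load_data("https://example.org/gleanerconfig.yaml")  # fetched
-#   load_data("/gleaner/gleanerconfig.yaml")             # read from disk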
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
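-# sketch of the resulting object key (name hypothetical, timestamp varies):
-#   s3loader(b"...", "sch_demo_gleaner") writes
-#   scheduler/logs/sch_demo_gleaner_2024_01_01_00_00_00.log
-# into GLEANER_MINIO_BUCKET, where scheduler/logs/ is GLEANERIO_LOG_PREFIX.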
-def postRelease(source):
- # revision of EC utilities, will have a insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
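-# Sketch of the request above with hypothetical values: POST
-#   http://triplestore:9999/blazegraph/namespace/earthcube/sparql?uri=http://minio:9000/gleaner/graphs/latest/demo_release.nq
-# asks the graph store to fetch and load the release file itself; a 200 reply
-# containing data modified="0" means nothing was loaded, treated as failure here.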
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
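-# The header juggling above exists because docker-py offers no first-class way
-# to attach arbitrary HTTP headers; seeding HttpHeaders in _general_configs and
-# api.headers is what authenticates against a Portainer-proxied Docker API.
-# Quick check (a sketch, assuming PORTAINER_URL and PORTAINER_KEY are set):
-#   client = _get_client(DockerContainerContext())
-#   client.ping()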
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
-    ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
- # docker.py if replicated job, total completions = replicas
- # replicas =0 you do not get a container
-    service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
-        mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
-            raise Exception(f"Container for service {name} not starting")
-
-    get_dagster_logger().info(f"found {len(containers)} containers for service {name}")
- return service, containers[0]
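-# Why _create_service polls: creating a Swarm service returns before any task
-# is scheduled, so the loop above waits (up to roughly 12s) for the first
-# container that Swarm labels with com.docker.swarm.service.name=<service name>.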
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
-        # ARGS = f"gleaner --cfg /gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
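-        # note: SPARQL_ENDPOINT above already satisfies the TODO, being derived
-        # from GLEANER_GRAPH_URL and GLEANER_GRAPH_NAMESPACE via _graphEndpoint().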
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
-        cid = container.id  # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
-        ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
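-        # e.g. (hypothetical PORTAINER_URL) this forms
-        #   GET {URL}/containers/<cid>/archive?path=%2Fnabu%2F%2Flogs
-        # note the doubled slash from WorkingDir ("/nabu/") + "/logs"; the
-        # Docker Engine API returns the path as a tar stream.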
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
-        # Future: need to extract files, and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def cuahsihisnewnids0_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def cuahsihisnewnids0_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "cuahsihisnewnids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisnewnids0_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "cuahsihisnewnids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisnewnids0_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "cuahsihisnewnids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisnewnids0_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "cuahsihisnewnids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisnewnids0_naburelease(context):
- returned_value = gleanerio(context,("release"), "cuahsihisnewnids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisnewnids0_uploadrelease(context):
- returned_value = postRelease("cuahsihisnewnids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisnewnids0_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisnewnids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisnewnids0"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-    r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisnewnids0_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisnewnids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisnewnids0"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisnewnids0_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisnewnids0")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisnewnids0"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisnewnids0_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisnewnids0")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisnewnids0"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
-    get_dagster_logger().info(f"identifier stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisnewnids0_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisnewnids0"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
-    get_dagster_logger().info(f"bucket urls report returned {r} ")
- return
-
-
-# Can we simplify and use just a method, then import these methods?
-# def missingreport_s3(context, msg: str, source="cuahsihisnewnids0"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="cuahsihisnewnids0"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_cuahsihisnewnids0():
- containers = cuahsihisnewnids0_getImage()
- harvest = cuahsihisnewnids0_gleaner(start=containers)
-
-# defining nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = cuahsihisnewnids0_missingreport_s3(start=harvest)
- report_idstat = cuahsihisnewnids0_identifier_stats(start=report_ms3)
-    # for some reason, this causes a "missing msg parameter" error
- report_bucketurl = cuahsihisnewnids0_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="cuahsihisnewnids0")
- load_release = cuahsihisnewnids0_naburelease(start=harvest)
- load_uploadrelease = cuahsihisnewnids0_uploadrelease(start=load_release)
-
- load_prune = cuahsihisnewnids0_nabu_prune(start=load_uploadrelease)
- load_prov = cuahsihisnewnids0_nabuprov(start=load_prune)
- load_org = cuahsihisnewnids0_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=cuahsihisnewnids0_missingreport_graph(start=load_org)
- report_graph=cuahsihisnewnids0_graph_reports(start=report_msgraph)
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisnhgswofids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisnhgswofids0.py
deleted file mode 100644
index f577368a..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisnhgswofids0.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils.util
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, and not the names of the object in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
-    with open(image_path, 'rb') as f:
-        return f.read()
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
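-        # e.g. (hypothetical) address "minio.example.org" with port "9000"
-        # gives "minio.example.org:9000", while port "443" with SSL on, or
-        # "80" without, collapses to just the bare address above.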
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
- # revision of EC utilities, will have a insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
-    ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
- # docker.py if replicated job, total completions = replicas
- # replicas =0 you do not get a container
-    service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
-        mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
-            raise Exception(f"Container for service {name} not starting")
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
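-    # mode selects the container to run: "gleaner" harvests a source; the nabu
-    # modes ("release", "prune", "prov", "orgs") post-process the summoned
-    # objects and the graph. Each branch below sets the image, command line,
-    # and working directory for that run.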
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
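-        # enva now holds plain "KEY=value" strings, e.g. "MINIO_PORT=9000"
-        # (value illustrative), matching the BindEnv names listed above.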
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
-        cid = container.id  # legacy until the service start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
-        ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
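-        # Docker Engine API: GET /containers/{id}/archive?path=... returns a tar
-        # stream of that path; here it pulls the run's log directory out of the
-        # stopped container (via the Portainer proxy, hence the X-API-Key header).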
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
-        # Future: need to extract files and upload them individually
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
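-# Contract for the ops below: gleanerio() raises on any non-zero container exit,
-# so a downstream op only runs when its upstream step actually succeeded.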
-
-@op
-def cuahsihisnhgswofids0_getImage(context):
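-    # Pre-pull both images through the Portainer-side Docker daemon so the
-    # per-mode services created later start without an image-fetch delay.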
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def cuahsihisnhgswofids0_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "cuahsihisnhgswofids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisnhgswofids0_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "cuahsihisnhgswofids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisnhgswofids0_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "cuahsihisnhgswofids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisnhgswofids0_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "cuahsihisnhgswofids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisnhgswofids0_naburelease(context):
- returned_value = gleanerio(context,("release"), "cuahsihisnhgswofids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisnhgswofids0_uploadrelease(context):
- returned_value = postRelease("cuahsihisnhgswofids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisnhgswofids0_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisnhgswofids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisnhgswofids0"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-    r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisnhgswofids0_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisnhgswofids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisnhgswofids0"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisnhgswofids0_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisnhgswofids0")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisnhgswofids0"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisnhgswofids0_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisnhgswofids0")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisnhgswofids0"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
-    get_dagster_logger().info(f"identifier stats report returned {r}")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisnhgswofids0_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisnhgswofids0"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
-    get_dagster_logger().info(f"bucket urls report returned {r}")
- return
-
-
-#Can we simplify and use just a method. Then import these methods?
-# def missingreport_s3(context, msg: str, source="cuahsihisnhgswofids0"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="cuahsihisnhgswofids0"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_cuahsihisnhgswofids0():
- containers = cuahsihisnhgswofids0_getImage()
- harvest = cuahsihisnhgswofids0_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = cuahsihisnhgswofids0_missingreport_s3(start=harvest)
- report_idstat = cuahsihisnhgswofids0_identifier_stats(start=report_ms3)
- # for some reason, this causes a msg parameter missing
- report_bucketurl = cuahsihisnhgswofids0_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="cuahsihisnhgswofids0")
- load_release = cuahsihisnhgswofids0_naburelease(start=harvest)
- load_uploadrelease = cuahsihisnhgswofids0_uploadrelease(start=load_release)
-
- load_prune = cuahsihisnhgswofids0_nabu_prune(start=load_uploadrelease)
- load_prov = cuahsihisnhgswofids0_nabuprov(start=load_prune)
- load_org = cuahsihisnhgswofids0_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=cuahsihisnhgswofids0_missingreport_graph(start=load_org)
- report_graph=cuahsihisnhgswofids0_graph_reports(start=report_msgraph)
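-# Order: image pull -> harvest; then two chains fan out from harvest: reports
-# (missing s3 -> identifier stats -> bucket urls) and load (release -> upload ->
-# prune -> prov -> orgs); the graph reports run only after the load chain finishes.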
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisnooksackmicroclimatenetworkids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisnooksackmicroclimatenetworkids0.py
deleted file mode 100644
index 931b1b38..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisnooksackmicroclimatenetworkids0.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
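-# NOTE: generated per-source module; apart from the hard-coded source name this
-# file is identical to the other implnet_ops_* modules under generatedCode/.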
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network names must be the Docker names, not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
- # revision of EC utilities, will have a insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
-    # return both the service and its container, since a replicated job
-    # has exactly one
-    restart_policy = RestartPolicy(condition='none')
-    # docker.py: for a replicated job, total completions = replicas;
-    # with replicas=0 you do not get a container
-    service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
-        mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
-            raise Exception(f"Container for service {name} not starting")
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
-        cid = container.id  # legacy until the service start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
-        ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
-        # Future: need to extract files and upload them individually
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def cuahsihisnooksackmicroclimatenetworkids0_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def cuahsihisnooksackmicroclimatenetworkids0_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "cuahsihisnooksackmicroclimatenetworkids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisnooksackmicroclimatenetworkids0_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "cuahsihisnooksackmicroclimatenetworkids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisnooksackmicroclimatenetworkids0_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "cuahsihisnooksackmicroclimatenetworkids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisnooksackmicroclimatenetworkids0_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "cuahsihisnooksackmicroclimatenetworkids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisnooksackmicroclimatenetworkids0_naburelease(context):
- returned_value = gleanerio(context,("release"), "cuahsihisnooksackmicroclimatenetworkids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisnooksackmicroclimatenetworkids0_uploadrelease(context):
- returned_value = postRelease("cuahsihisnooksackmicroclimatenetworkids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisnooksackmicroclimatenetworkids0_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisnooksackmicroclimatenetworkids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisnooksackmicroclimatenetworkids0"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-    r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisnooksackmicroclimatenetworkids0_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisnooksackmicroclimatenetworkids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisnooksackmicroclimatenetworkids0"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisnooksackmicroclimatenetworkids0_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisnooksackmicroclimatenetworkids0")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisnooksackmicroclimatenetworkids0"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisnooksackmicroclimatenetworkids0_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisnooksackmicroclimatenetworkids0")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisnooksackmicroclimatenetworkids0"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
-    get_dagster_logger().info(f"identifier stats report returned {r}")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisnooksackmicroclimatenetworkids0_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisnooksackmicroclimatenetworkids0"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
-    get_dagster_logger().info(f"bucket urls report returned {r}")
- return
-
-
-#Can we simplify and use just a method. Then import these methods?
-# def missingreport_s3(context, msg: str, source="cuahsihisnooksackmicroclimatenetworkids0"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="cuahsihisnooksackmicroclimatenetworkids0"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_cuahsihisnooksackmicroclimatenetworkids0():
- containers = cuahsihisnooksackmicroclimatenetworkids0_getImage()
- harvest = cuahsihisnooksackmicroclimatenetworkids0_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = cuahsihisnooksackmicroclimatenetworkids0_missingreport_s3(start=harvest)
- report_idstat = cuahsihisnooksackmicroclimatenetworkids0_identifier_stats(start=report_ms3)
- # for some reason, this causes a msg parameter missing
- report_bucketurl = cuahsihisnooksackmicroclimatenetworkids0_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="cuahsihisnooksackmicroclimatenetworkids0")
- load_release = cuahsihisnooksackmicroclimatenetworkids0_naburelease(start=harvest)
- load_uploadrelease = cuahsihisnooksackmicroclimatenetworkids0_uploadrelease(start=load_release)
-
- load_prune = cuahsihisnooksackmicroclimatenetworkids0_nabu_prune(start=load_uploadrelease)
- load_prov = cuahsihisnooksackmicroclimatenetworkids0_nabuprov(start=load_prune)
- load_org = cuahsihisnooksackmicroclimatenetworkids0_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=cuahsihisnooksackmicroclimatenetworkids0_missingreport_graph(start=load_org)
- report_graph=cuahsihisnooksackmicroclimatenetworkids0_graph_reports(start=report_msgraph)
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisodmkentstateids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisodmkentstateids0.py
deleted file mode 100644
index c8d538f2..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisodmkentstateids0.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
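-# NOTE: generated per-source module; apart from the hard-coded source name this
-# file is identical to the other implnet_ops_* modules under generatedCode/.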
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network names must be the Docker names, not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
- # revision of EC utilities, will have a insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
- ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
- # docker.py if replicated job, total completions = replicas
- # replicas =0 you do not get a container
- service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
- mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise f"Container for service {name} not starting"
-
- get_dagster_logger().info(f"running containers: {len(containers)}")
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
- cid = container.id # legacy until the service start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
- ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
- # Future: need to extract the files and upload them
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def cuahsihisodmkentstateids0_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def cuahsihisodmkentstateids0_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "cuahsihisodmkentstateids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisodmkentstateids0_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "cuahsihisodmkentstateids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisodmkentstateids0_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "cuahsihisodmkentstateids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisodmkentstateids0_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "cuahsihisodmkentstateids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisodmkentstateids0_naburelease(context):
- returned_value = gleanerio(context,("release"), "cuahsihisodmkentstateids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisodmkentstateids0_uploadrelease(context):
- returned_value = postRelease("cuahsihisodmkentstateids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisodmkentstateids0_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisodmkentstateids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisodmkentstateids0"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisodmkentstateids0_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisodmkentstateids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisodmkentstateids0"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisodmkentstateids0_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisodmkentstateids0")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisodmkentstateids0"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisodmkentstateids0_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisodmkentstateids0")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisodmkentstateids0"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisodmkentstateids0_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisodmkentstateids0"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
-# Can we simplify this by defining one parameterized method and importing it? See the sketch after this block.
-# def missingreport_s3(context, msg: str, source="cuahsihisodmkentstateids0"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="cuahsihisodmkentstateids0"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
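-
- # NOTE (editorial sketch, not generated code): one possible answer to the
- # question above. A small factory could replace the per-source copies of the
- # gleaner/nabu ops; all names below are illustrative only:
- def make_gleanerio_op(source_name: str, mode: str):
-     @op(name=f"{source_name}_{mode}", ins={"start": In(Nothing)})
-     def _gleanerio_op(context):
-         returned_value = gleanerio(context, mode, source_name)
-         get_dagster_logger().info(f"{mode} for {source_name} returned {returned_value}")
-     return _gleanerio_op
-
- # Usage (illustrative): harvest = make_gleanerio_op("cuahsihisodmkentstateids0", "gleaner")
-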
-@graph
-def harvest_cuahsihisodmkentstateids0():
- containers = cuahsihisodmkentstateids0_getImage()
- harvest = cuahsihisodmkentstateids0_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = cuahsihisodmkentstateids0_missingreport_s3(start=harvest)
- report_idstat = cuahsihisodmkentstateids0_identifier_stats(start=report_ms3)
- # for some reason, this causes a msg parameter missing
- report_bucketurl = cuahsihisodmkentstateids0_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="cuahsihisodmkentstateids0")
- load_release = cuahsihisodmkentstateids0_naburelease(start=harvest)
- load_uploadrelease = cuahsihisodmkentstateids0_uploadrelease(start=load_release)
-
- load_prune = cuahsihisodmkentstateids0_nabu_prune(start=load_uploadrelease)
- load_prov = cuahsihisodmkentstateids0_nabuprov(start=load_prune)
- load_org = cuahsihisodmkentstateids0_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=cuahsihisodmkentstateids0_missingreport_graph(start=load_org)
- report_graph=cuahsihisodmkentstateids0_graph_reports(start=report_msgraph)
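-
- # NOTE (editorial sketch): how a graph like this is typically turned into a
- # runnable job and schedule in Dagster; the repo's real wiring lives elsewhere
- # in the generated output and may differ, so names here are illustrative:
- #   from dagster import ScheduleDefinition
- #   harvest_job = harvest_cuahsihisodmkentstateids0.to_job()
- #   harvest_schedule = ScheduleDefinition(job=harvest_job, cron_schedule="0 6 * * *")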
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisorsancohabids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisorsancohabids0.py
deleted file mode 100644
index cdf26706..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisorsancohabids0.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
- # volume and network need to be the names in docker, not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
- # revision of EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
- ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
- # docker.py if replicated job, total completions = replicas
- # replicas =0 you do not get a container
- service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
- mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise f"Container for service {name} not starting"
-
- get_dagster_logger().info(f"running containers: {len(containers)}")
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
- cid = container.id # legacy until the service start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
- ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
- # Future: need to extract the files and upload them
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def cuahsihisorsancohabids0_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def cuahsihisorsancohabids0_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "cuahsihisorsancohabids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisorsancohabids0_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "cuahsihisorsancohabids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisorsancohabids0_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "cuahsihisorsancohabids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisorsancohabids0_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "cuahsihisorsancohabids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisorsancohabids0_naburelease(context):
- returned_value = gleanerio(context,("release"), "cuahsihisorsancohabids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisorsancohabids0_uploadrelease(context):
- returned_value = postRelease("cuahsihisorsancohabids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisorsancohabids0_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisorsancohabids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisorsancohabids0"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisorsancohabids0_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisorsancohabids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisorsancohabids0"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisorsancohabids0_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisorsancohabids0")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisorsancohabids0"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisorsancohabids0_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisorsancohabids0")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisorsancohabids0"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisorsancohabids0_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisorsancohabids0"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
-# Can we simplify this by defining one parameterized method and importing it?
-# def missingreport_s3(context, msg: str, source="cuahsihisorsancohabids0"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="cuahsihisorsancohabids0"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_cuahsihisorsancohabids0():
- containers = cuahsihisorsancohabids0_getImage()
- harvest = cuahsihisorsancohabids0_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = cuahsihisorsancohabids0_missingreport_s3(start=harvest)
- report_idstat = cuahsihisorsancohabids0_identifier_stats(start=report_ms3)
- # for some reason, this causes a msg parameter missing
- report_bucketurl = cuahsihisorsancohabids0_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="cuahsihisorsancohabids0")
- load_release = cuahsihisorsancohabids0_naburelease(start=harvest)
- load_uploadrelease = cuahsihisorsancohabids0_uploadrelease(start=load_release)
-
- load_prune = cuahsihisorsancohabids0_nabu_prune(start=load_uploadrelease)
- load_prov = cuahsihisorsancohabids0_nabuprov(start=load_prune)
- load_org = cuahsihisorsancohabids0_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=cuahsihisorsancohabids0_missingreport_graph(start=load_org)
- report_graph=cuahsihisorsancohabids0_graph_reports(start=report_msgraph)
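-
- # NOTE (editorial sketch): the start=In(Nothing) pattern used throughout this
- # graph only enforces execution order; no value is passed between ops. A
- # minimal standalone illustration (hypothetical op names):
- #   @op
- #   def first_step():
- #       pass
- #   @op(ins={"start": In(Nothing)})
- #   def second_step():
- #       pass
- #   # inside a @graph body: second_step(start=first_step())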
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihispanolaodmids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihispanolaodmids0.py
deleted file mode 100644
index 84eb6651..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihispanolaodmids0.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
- # volume and network need to be the names in docker, not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
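-
- # NOTE (editorial sketch): _pythonMinioUrl only normalizes AWS-style hosts,
- # e.g. (inputs illustrative):
- #   _pythonMinioUrl("mybucket.s3.amazonaws.com")  -> "s3.amazonaws.com"
- #   _pythonMinioUrl("minio.example.org")          -> "minio.example.org"
-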
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
-    # revision of EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
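-    # POSTing to the SPARQL endpoint with ?uri=<release_url> asks the
-    # triplestore (Blazegraph) to fetch and load the N-Quads release file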
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
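-    # the docker API is reached through Portainer, which authenticates via
-    # the X-API-Key header; _general_configs is a private docker-py attribute,
-    # so injecting default headers below is a workaround, not a public API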
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
-    ## thoughts
-    # return service, container, since there is only one
-    restart_policy = RestartPolicy(condition='none')
-    # in docker-py, for a replicated job, total completions = replicas;
-    # replicas=0 means you do not get a container
-    service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
-        mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise f"Container for service {name} not starting"
-
-    get_dagster_logger().info(f"containers found: {len(containers)}")
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
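-        # a sketch of that idea (hypothetical, not wired in): derive the name
-        # from a PROJECT env var, mirroring the compose default:
-        # volume_name = f"dagster-{os.environ.get('PROJECT', 'eco')}"
-        # data["HostConfig"]["Binds"] = [f"{volume_name}:/configs"]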
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
-        cid = container.id  # legacy until the service start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
-        ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
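-        # the Docker Engine API's GET /containers/{id}/archive returns a tar
-        # stream of the given path; fetched here through Portainer's proxy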
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
-        # Future: need to extract files and upload them
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def cuahsihispanolaodmids0_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def cuahsihispanolaodmids0_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "cuahsihispanolaodmids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihispanolaodmids0_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "cuahsihispanolaodmids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihispanolaodmids0_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "cuahsihispanolaodmids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihispanolaodmids0_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "cuahsihispanolaodmids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihispanolaodmids0_naburelease(context):
- returned_value = gleanerio(context,("release"), "cuahsihispanolaodmids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihispanolaodmids0_uploadrelease(context):
- returned_value = postRelease("cuahsihispanolaodmids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def cuahsihispanolaodmids0_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihispanolaodmids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihispanolaodmids0"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-    r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihispanolaodmids0_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihispanolaodmids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihispanolaodmids0"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihispanolaodmids0_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihispanolaodmids0")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihispanolaodmids0"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihispanolaodmids0_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihispanolaodmids0")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihispanolaodmids0"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihispanolaodmids0_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihispanolaodmids0"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
-# Can we simplify and use just a method, then import these methods?
-# def missingreport_s3(context, msg: str, source="cuahsihispanolaodmids0"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="cuahsihispanolaodmids0"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_cuahsihispanolaodmids0():
- containers = cuahsihispanolaodmids0_getImage()
- harvest = cuahsihispanolaodmids0_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = cuahsihispanolaodmids0_missingreport_s3(start=harvest)
- report_idstat = cuahsihispanolaodmids0_identifier_stats(start=report_ms3)
-    # for some reason, this caused a "msg parameter missing" error
- report_bucketurl = cuahsihispanolaodmids0_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="cuahsihispanolaodmids0")
- load_release = cuahsihispanolaodmids0_naburelease(start=harvest)
- load_uploadrelease = cuahsihispanolaodmids0_uploadrelease(start=load_release)
-
- load_prune = cuahsihispanolaodmids0_nabu_prune(start=load_uploadrelease)
- load_prov = cuahsihispanolaodmids0_nabuprov(start=load_prune)
- load_org = cuahsihispanolaodmids0_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=cuahsihispanolaodmids0_missingreport_graph(start=load_org)
- report_graph=cuahsihispanolaodmids0_graph_reports(start=report_msgraph)
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisparalanaturalezaids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisparalanaturalezaids0.py
deleted file mode 100644
index 1aba516e..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisparalanaturalezaids0.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
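-    # the python minio client wants a bare endpoint; any *.amazonaws.com
-    # address is normalized to the global "s3.amazonaws.com" endpoint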
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
-    with open(image_path, 'rb') as f:
-        return f.read()
-
-
-def load_data(file_or_url):
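-    # urlopen raises ValueError for strings that do not parse as URLs,
-    # so anything that is not a URL is read from the local filesystem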
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
-    # revision of EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
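-    # POSTing to the SPARQL endpoint with ?uri=<release_url> asks the
-    # triplestore (Blazegraph) to fetch and load the N-Quads release file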
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
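-    # the docker API is reached through Portainer, which authenticates via
-    # the X-API-Key header; _general_configs is a private docker-py attribute,
-    # so injecting default headers below is a workaround, not a public API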
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
-    ## thoughts
-    # return service, container, since there is only one
-    restart_policy = RestartPolicy(condition='none')
-    # in docker-py, for a replicated job, total completions = replicas;
-    # replicas=0 means you do not get a container
-    service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
-        mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise f"Container for service {name} not starting"
-
-    get_dagster_logger().info(f"containers found: {len(containers)}")
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
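-        # a sketch of that idea (hypothetical, not wired in): derive the name
-        # from a PROJECT env var, mirroring the compose default:
-        # volume_name = f"dagster-{os.environ.get('PROJECT', 'eco')}"
-        # data["HostConfig"]["Binds"] = [f"{volume_name}:/configs"]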
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
-        cid = container.id  # legacy until the service start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
-        ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
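-        # the Docker Engine API's GET /containers/{id}/archive returns a tar
-        # stream of the given path; fetched here through Portainer's proxy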
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
-        # Future: need to extract files and upload them
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def cuahsihisparalanaturalezaids0_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def cuahsihisparalanaturalezaids0_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "cuahsihisparalanaturalezaids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisparalanaturalezaids0_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "cuahsihisparalanaturalezaids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisparalanaturalezaids0_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "cuahsihisparalanaturalezaids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisparalanaturalezaids0_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "cuahsihisparalanaturalezaids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisparalanaturalezaids0_naburelease(context):
- returned_value = gleanerio(context,("release"), "cuahsihisparalanaturalezaids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisparalanaturalezaids0_uploadrelease(context):
- returned_value = postRelease("cuahsihisparalanaturalezaids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisparalanaturalezaids0_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisparalanaturalezaids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisparalanaturalezaids0"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-    r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisparalanaturalezaids0_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisparalanaturalezaids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisparalanaturalezaids0"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisparalanaturalezaids0_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisparalanaturalezaids0")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisparalanaturalezaids0"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisparalanaturalezaids0_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisparalanaturalezaids0")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisparalanaturalezaids0"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisparalanaturalezaids0_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisparalanaturalezaids0"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
-# Can we simplify and use just a method, then import these methods?
-# def missingreport_s3(context, msg: str, source="cuahsihisparalanaturalezaids0"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="cuahsihisparalanaturalezaids0"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_cuahsihisparalanaturalezaids0():
- containers = cuahsihisparalanaturalezaids0_getImage()
- harvest = cuahsihisparalanaturalezaids0_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = cuahsihisparalanaturalezaids0_missingreport_s3(start=harvest)
- report_idstat = cuahsihisparalanaturalezaids0_identifier_stats(start=report_ms3)
-    # for some reason, this caused a "msg parameter missing" error
- report_bucketurl = cuahsihisparalanaturalezaids0_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="cuahsihisparalanaturalezaids0")
- load_release = cuahsihisparalanaturalezaids0_naburelease(start=harvest)
- load_uploadrelease = cuahsihisparalanaturalezaids0_uploadrelease(start=load_release)
-
- load_prune = cuahsihisparalanaturalezaids0_nabu_prune(start=load_uploadrelease)
- load_prov = cuahsihisparalanaturalezaids0_nabuprov(start=load_prune)
- load_org = cuahsihisparalanaturalezaids0_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=cuahsihisparalanaturalezaids0_missingreport_graph(start=load_org)
- report_graph=cuahsihisparalanaturalezaids0_graph_reports(start=report_msgraph)
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisprovorivergamutids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisprovorivergamutids0.py
deleted file mode 100644
index 5862e9cb..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisprovorivergamutids0.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
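-    # the python minio client wants a bare endpoint; any *.amazonaws.com
-    # address is normalized to the global "s3.amazonaws.com" endpoint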
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
-    with open(image_path, 'rb') as f:
-        return f.read()
-
-
-def load_data(file_or_url):
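-    # urlopen raises ValueError for strings that do not parse as URLs,
-    # so anything that is not a URL is read from the local filesystem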
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
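-    # omit the port when it is the protocol default (80 for http, 443 for https);
-    # otherwise hand the MinIO client an explicit host:port endpoint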
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
-    # a future revision of EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
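-    # POSTing to the namespace sparql endpoint with ?uri=<release file> asks
-    # Blazegraph to ingest the release nquads directly from the object store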
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
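-    # docker-py has no option for extra auth headers, so the Portainer API key
-    # is pushed into the client's general configs and request headers directly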
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
-    ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
- # docker.py if replicated job, total completions = replicas
- # replicas =0 you do not get a container
-    service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
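-    # look up the gleaner and nabu docker configs by name so they can be
-    # mounted into the service at the paths the tools expect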
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
-        mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
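-    # a replicated-job service starts its task asynchronously; poll for up to
-    # ~12 seconds until the swarm task's container appears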
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
-        if wait_count > 12:
-            raise Exception(f"Container for service {name} not starting")
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
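-    # mode selects the container run: "gleaner" harvests a source; the nabu
-    # modes ("prune", "prov", "orgs", "release") post-process the object store
-    # and triplestore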
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
-    service = None  # so the finally block can tell whether the service was created
-    try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
-        cid = container.id # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
-        ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
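-        # fetch {WorkingDir}/logs from the container as a tar stream via the
-        # Docker Engine /containers/{id}/archive endpoint, authenticated with
-        # the Portainer API key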
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
-        # Future: need to extract files and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def cuahsihisprovorivergamutids0_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def cuahsihisprovorivergamutids0_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "cuahsihisprovorivergamutids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisprovorivergamutids0_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "cuahsihisprovorivergamutids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisprovorivergamutids0_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "cuahsihisprovorivergamutids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisprovorivergamutids0_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "cuahsihisprovorivergamutids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisprovorivergamutids0_naburelease(context):
- returned_value = gleanerio(context,("release"), "cuahsihisprovorivergamutids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisprovorivergamutids0_uploadrelease(context):
- returned_value = postRelease("cuahsihisprovorivergamutids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisprovorivergamutids0_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisprovorivergamutids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisprovorivergamutids0"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-    r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisprovorivergamutids0_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisprovorivergamutids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisprovorivergamutids0"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
-    summon = False # milled only; summon off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisprovorivergamutids0_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisprovorivergamutids0")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisprovorivergamutids0"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisprovorivergamutids0_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisprovorivergamutids0")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisprovorivergamutids0"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisprovorivergamutids0_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisprovorivergamutids0"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
-# Can we simplify and use just a method, then import these methods?
-# def missingreport_s3(context, msg: str, source="cuahsihisprovorivergamutids0"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="cuahsihisprovorivergamutids0"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_cuahsihisprovorivergamutids0():
- containers = cuahsihisprovorivergamutids0_getImage()
- harvest = cuahsihisprovorivergamutids0_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = cuahsihisprovorivergamutids0_missingreport_s3(start=harvest)
- report_idstat = cuahsihisprovorivergamutids0_identifier_stats(start=report_ms3)
-    # for some reason, this causes a missing msg parameter error
- report_bucketurl = cuahsihisprovorivergamutids0_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="cuahsihisprovorivergamutids0")
- load_release = cuahsihisprovorivergamutids0_naburelease(start=harvest)
- load_uploadrelease = cuahsihisprovorivergamutids0_uploadrelease(start=load_release)
-
- load_prune = cuahsihisprovorivergamutids0_nabu_prune(start=load_uploadrelease)
- load_prov = cuahsihisprovorivergamutids0_nabuprov(start=load_prune)
- load_org = cuahsihisprovorivergamutids0_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=cuahsihisprovorivergamutids0_missingreport_graph(start=load_org)
- report_graph=cuahsihisprovorivergamutids0_graph_reports(start=report_msgraph)
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisredbuttecreekgamutids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisredbuttecreekgamutids0.py
deleted file mode 100644
index 4d4ffa61..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisredbuttecreekgamutids0.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils.util
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
-    with open(image_path, 'rb') as f:
-        return f.read()
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
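-    # omit the port when it is the protocol default (80 for http, 443 for https);
-    # otherwise hand the MinIO client an explicit host:port endpoint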
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
-    # a future revision of EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
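-    # POSTing to the namespace sparql endpoint with ?uri=<release file> asks
-    # Blazegraph to ingest the release nquads directly from the object store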
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
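-    # docker-py has no option for extra auth headers, so the Portainer API key
-    # is pushed into the client's general configs and request headers directly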
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
-    ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
- # docker.py if replicated job, total completions = replicas
- # replicas =0 you do not get a container
-    service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
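-    # look up the gleaner and nabu docker configs by name so they can be
-    # mounted into the service at the paths the tools expect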
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
-        mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
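-    # a replicated-job service starts its task asynchronously; poll for up to
-    # ~12 seconds until the swarm task's container appears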
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
-        if wait_count > 12:
-            raise Exception(f"Container for service {name} not starting")
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
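-    # mode selects the container run: "gleaner" harvests a source; the nabu
-    # modes ("prune", "prov", "orgs", "release") post-process the object store
-    # and triplestore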
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
-    service = None  # so the finally block can tell whether the service was created
-    try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
-        cid = container.id # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
-        ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
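-        # fetch {WorkingDir}/logs from the container as a tar stream via the
-        # Docker Engine /containers/{id}/archive endpoint, authenticated with
-        # the Portainer API key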
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
-        # Future: need to extract files and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def cuahsihisredbuttecreekgamutids0_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def cuahsihisredbuttecreekgamutids0_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "cuahsihisredbuttecreekgamutids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisredbuttecreekgamutids0_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "cuahsihisredbuttecreekgamutids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisredbuttecreekgamutids0_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "cuahsihisredbuttecreekgamutids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisredbuttecreekgamutids0_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "cuahsihisredbuttecreekgamutids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisredbuttecreekgamutids0_naburelease(context):
- returned_value = gleanerio(context,("release"), "cuahsihisredbuttecreekgamutids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisredbuttecreekgamutids0_uploadrelease(context):
- returned_value = postRelease("cuahsihisredbuttecreekgamutids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisredbuttecreekgamutids0_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisredbuttecreekgamutids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisredbuttecreekgamutids0"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-    r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisredbuttecreekgamutids0_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisredbuttecreekgamutids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisredbuttecreekgamutids0"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
-    summon = False # milled only; summon off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisredbuttecreekgamutids0_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisredbuttecreekgamutids0")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisredbuttecreekgamutids0"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisredbuttecreekgamutids0_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisredbuttecreekgamutids0")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisredbuttecreekgamutids0"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisredbuttecreekgamutids0_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisredbuttecreekgamutids0"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
-# Can we simplify and use just a method, then import these methods?
-# def missingreport_s3(context, msg: str, source="cuahsihisredbuttecreekgamutids0"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="cuahsihisredbuttecreekgamutids0"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_cuahsihisredbuttecreekgamutids0():
- containers = cuahsihisredbuttecreekgamutids0_getImage()
- harvest = cuahsihisredbuttecreekgamutids0_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = cuahsihisredbuttecreekgamutids0_missingreport_s3(start=harvest)
- report_idstat = cuahsihisredbuttecreekgamutids0_identifier_stats(start=report_ms3)
-    # for some reason, this causes a missing msg parameter error
- report_bucketurl = cuahsihisredbuttecreekgamutids0_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="cuahsihisredbuttecreekgamutids0")
- load_release = cuahsihisredbuttecreekgamutids0_naburelease(start=harvest)
- load_uploadrelease = cuahsihisredbuttecreekgamutids0_uploadrelease(start=load_release)
-
- load_prune = cuahsihisredbuttecreekgamutids0_nabu_prune(start=load_uploadrelease)
- load_prov = cuahsihisredbuttecreekgamutids0_nabuprov(start=load_prune)
- load_org = cuahsihisredbuttecreekgamutids0_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=cuahsihisredbuttecreekgamutids0_missingreport_graph(start=load_org)
- report_graph=cuahsihisredbuttecreekgamutids0_graph_reports(start=report_msgraph)
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisrmblids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisrmblids0.py
deleted file mode 100644
index 4567b8cc..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisrmblids0.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils.util
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
-    # a future revision of EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
-    ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
-    # docker-py: for a replicated job, total completions = replicas
-    # with replicas=0 you do not get a container
-    service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
-        mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
-            raise Exception(f"Container for service {name} not starting")
-
-    get_dagster_logger().info(f"found {len(containers)} container(s)")
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
-        cid = container.id # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
-        ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
-        # Future: need to extract files and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def cuahsihisrmblids0_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def cuahsihisrmblids0_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "cuahsihisrmblids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisrmblids0_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "cuahsihisrmblids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisrmblids0_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "cuahsihisrmblids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisrmblids0_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "cuahsihisrmblids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisrmblids0_naburelease(context):
- returned_value = gleanerio(context,("release"), "cuahsihisrmblids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisrmblids0_uploadrelease(context):
- returned_value = postRelease("cuahsihisrmblids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisrmblids0_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisrmblids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisrmblids0"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-    r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisrmblids0_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisrmblids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisrmblids0"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisrmblids0_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisrmblids0")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisrmblids0"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisrmblids0_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisrmblids0")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisrmblids0"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
-    get_dagster_logger().info(f"identifier stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisrmblids0_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisrmblids0"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
-    get_dagster_logger().info(f"bucket urls report returned {r} ")
- return
-
-
-#Can we simplify and use just a method. Then import these methods?
-# def missingreport_s3(context, msg: str, source="cuahsihisrmblids0"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="cuahsihisrmblids0"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_cuahsihisrmblids0():
- containers = cuahsihisrmblids0_getImage()
- harvest = cuahsihisrmblids0_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = cuahsihisrmblids0_missingreport_s3(start=harvest)
- report_idstat = cuahsihisrmblids0_identifier_stats(start=report_ms3)
-    # for some reason, this causes a "missing msg parameter" error
- report_bucketurl = cuahsihisrmblids0_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="cuahsihisrmblids0")
- load_release = cuahsihisrmblids0_naburelease(start=harvest)
- load_uploadrelease = cuahsihisrmblids0_uploadrelease(start=load_release)
-
- load_prune = cuahsihisrmblids0_nabu_prune(start=load_uploadrelease)
- load_prov = cuahsihisrmblids0_nabuprov(start=load_prune)
- load_org = cuahsihisrmblids0_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=cuahsihisrmblids0_missingreport_graph(start=load_org)
- report_graph=cuahsihisrmblids0_graph_reports(start=report_msgraph)
-
-
-
-
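
The `_create_service` helper repeated in each of these modules runs Gleaner/Nabu as a one-shot Docker swarm service and then polls for the backing container. A condensed sketch of just that mechanism, assuming a reachable swarm manager, an already-pulled image, and docker-py with job service modes:

```python
# Sketch of the run-once swarm-service pattern from _create_service; the
# helper name and timeout are illustrative, not from the generated code.
import time

import docker
from docker.types import RestartPolicy, ServiceMode

def run_once_as_service(client: docker.DockerClient, image: str, args, name: str):
    service = client.services.create(
        image,
        args=args,
        name=name,
        restart_policy=RestartPolicy(condition="none"),  # never restart the task
        # replicated-job: total completions == replicas, so 1/1 runs exactly once
        mode=ServiceMode("replicated-job", concurrency=1, replicas=1),
    )
    # the swarm schedules the task asynchronously, so poll for its container
    for _ in range(12):
        time.sleep(1)
        containers = client.containers.list(
            all=True, filters={"label": f"com.docker.swarm.service.name={name}"}
        )
        if containers:
            return service, containers[0]
    service.remove()
    raise RuntimeError(f"no container scheduled for service {name}")
```

Returning the container alongside the service is what lets the caller stream logs and `wait()` on the exit code, as `gleanerio` does above.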
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihissagehencreekids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihissagehencreekids0.py
deleted file mode 100644
index 36095daf..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihissagehencreekids0.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils.util
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in Docker, not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
-    # a future revision of EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
-    ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
-    # docker-py: for a replicated job, total completions = replicas
-    # with replicas=0 you do not get a container
-    service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
-        mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
-            raise Exception(f"Container for service {name} not starting")
-
-    get_dagster_logger().info(f"found {len(containers)} container(s)")
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
-        cid = container.id # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
-        ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
-        # Future: need to extract files and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def cuahsihissagehencreekids0_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def cuahsihissagehencreekids0_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "cuahsihissagehencreekids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihissagehencreekids0_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "cuahsihissagehencreekids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihissagehencreekids0_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "cuahsihissagehencreekids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihissagehencreekids0_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "cuahsihissagehencreekids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihissagehencreekids0_naburelease(context):
- returned_value = gleanerio(context,("release"), "cuahsihissagehencreekids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihissagehencreekids0_uploadrelease(context):
- returned_value = postRelease("cuahsihissagehencreekids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def cuahsihissagehencreekids0_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihissagehencreekids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihissagehencreekids0"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-    r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihissagehencreekids0_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihissagehencreekids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihissagehencreekids0"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihissagehencreekids0_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihissagehencreekids0")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihissagehencreekids0"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihissagehencreekids0_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihissagehencreekids0")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihissagehencreekids0"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
-    get_dagster_logger().info(f"identifier stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihissagehencreekids0_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihissagehencreekids0"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
-    get_dagster_logger().info(f"bucket urls report returned {r} ")
- return
-
-
-#Can we simplify and use just a method. Then import these methods?
-# def missingreport_s3(context, msg: str, source="cuahsihissagehencreekids0"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="cuahsihissagehencreekids0"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_cuahsihissagehencreekids0():
- containers = cuahsihissagehencreekids0_getImage()
- harvest = cuahsihissagehencreekids0_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = cuahsihissagehencreekids0_missingreport_s3(start=harvest)
- report_idstat = cuahsihissagehencreekids0_identifier_stats(start=report_ms3)
-    # for some reason, this causes a "missing msg parameter" error
- report_bucketurl = cuahsihissagehencreekids0_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="cuahsihissagehencreekids0")
- load_release = cuahsihissagehencreekids0_naburelease(start=harvest)
- load_uploadrelease = cuahsihissagehencreekids0_uploadrelease(start=load_release)
-
- load_prune = cuahsihissagehencreekids0_nabu_prune(start=load_uploadrelease)
- load_prov = cuahsihissagehencreekids0_nabuprov(start=load_prune)
- load_org = cuahsihissagehencreekids0_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=cuahsihissagehencreekids0_missingreport_graph(start=load_org)
- report_graph=cuahsihissagehencreekids0_graph_reports(start=report_msgraph)
-
-
-
-
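
Each module also opens with the same long block of environment lookups built on `distutils.util.strtobool`, which is deprecated as of Python 3.10 and removed from the standard library in 3.12. A hedged sketch of a drop-in helper pair (the `GLEANERIO_*` names are the real ones; the helper functions and defaults are illustrative):

```python
# Sketch of the env-var pattern repeated at the top of each generated module;
# env_bool avoids distutils.util.strtobool, which is gone in Python 3.12.
import os
from typing import Optional

def env_str(name: str, default: Optional[str] = None) -> str:
    """Read a string setting, failing loudly when a required one is unset."""
    value = os.environ.get(name, default)
    if value is None:
        raise RuntimeError(f"required environment variable {name} is not set")
    return value

def env_bool(name: str, default: str = "false") -> bool:
    """Parse common truthy strings; unlike strtobool, unknown values read as False."""
    return os.environ.get(name, default).strip().lower() in ("1", "true", "yes", "y", "on")

GLEANER_MINIO_ADDRESS = env_str("GLEANERIO_MINIO_ADDRESS", "localhost")
GLEANER_MINIO_USE_SSL = env_bool("GLEANERIO_MINIO_USE_SSL")
```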
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisscanids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisscanids0.py
deleted file mode 100644
index 96432ecd..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisscanids0.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils.util
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, and not the names of the object in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
- # revision of EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
- ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
- # docker.py: for a replicated job, total completions = replicas
- # with replicas = 0 you do not get a container
- service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
- mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise f"Container for service {name} not starting"
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
- cid = container.id # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
- ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
- # Future: need to extract files and upload them
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def cuahsihisscanids0_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def cuahsihisscanids0_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "cuahsihisscanids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisscanids0_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "cuahsihisscanids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisscanids0_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "cuahsihisscanids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisscanids0_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "cuahsihisscanids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisscanids0_naburelease(context):
- returned_value = gleanerio(context,("release"), "cuahsihisscanids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisscanids0_uploadrelease(context):
- returned_value = postRelease("cuahsihisscanids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisscanids0_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisscanids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisscanids0"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisscanids0_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisscanids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisscanids0"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisscanids0_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisscanids0")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisscanids0"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisscanids0_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisscanids0")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisscanids0"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisscanids0_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisscanids0"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
-# Can we simplify and use just a method, then import these methods?
-# def missingreport_s3(context, msg: str, source="cuahsihisscanids0"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="cuahsihisscanids0"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_cuahsihisscanids0():
- containers = cuahsihisscanids0_getImage()
- harvest = cuahsihisscanids0_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = cuahsihisscanids0_missingreport_s3(start=harvest)
- report_idstat = cuahsihisscanids0_identifier_stats(start=report_ms3)
- # for some reason, this causes a missing msg parameter error
- report_bucketurl = cuahsihisscanids0_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="cuahsihisscanids0")
- load_release = cuahsihisscanids0_naburelease(start=harvest)
- load_uploadrelease = cuahsihisscanids0_uploadrelease(start=load_release)
-
- load_prune = cuahsihisscanids0_nabu_prune(start=load_uploadrelease)
- load_prov = cuahsihisscanids0_nabuprov(start=load_prune)
- load_org = cuahsihisscanids0_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=cuahsihisscanids0_missingreport_graph(start=load_org)
- report_graph=cuahsihisscanids0_graph_reports(start=report_msgraph)
-
-
-
-
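The s3loader helper above picks the MinIO server string by dropping the standard ports: port 80 without SSL and port 443 with SSL use the bare address, and any other port stays explicit. A hedged sketch of that normalization (minio_server is an illustrative name, not a helper in this codebase):

    def minio_server(address: str, port: str, secure: bool) -> str:
        # standard ports are implied by the scheme, so omit them
        if port == "80" and not secure:
            return address
        if port == "443" and secure:
            return address
        # non-standard port: keep it explicit
        return f"{address}:{port}"

    assert minio_server("minio.example.org", "9000", False) == "minio.example.org:9000"
    assert minio_server("s3.amazonaws.com", "443", True) == "s3.amazonaws.com"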
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisscotlandnwisids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisscotlandnwisids0.py
deleted file mode 100644
index 17111bde..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisscotlandnwisids0.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, and not the names of the object in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
- # revision of EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
- ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
- # docker.py: for a replicated job, total completions = replicas
- # with replicas = 0 you do not get a container
- service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
- mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise f"Container for service {name} not starting"
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
- cid = container.id # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
- ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
- # Future: need to extract files and upload them
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def cuahsihisscotlandnwisids0_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def cuahsihisscotlandnwisids0_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "cuahsihisscotlandnwisids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisscotlandnwisids0_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "cuahsihisscotlandnwisids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisscotlandnwisids0_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "cuahsihisscotlandnwisids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisscotlandnwisids0_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "cuahsihisscotlandnwisids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisscotlandnwisids0_naburelease(context):
- returned_value = gleanerio(context,("release"), "cuahsihisscotlandnwisids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisscotlandnwisids0_uploadrelease(context):
- returned_value = postRelease("cuahsihisscotlandnwisids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisscotlandnwisids0_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisscotlandnwisids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisscotlandnwisids0"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisscotlandnwisids0_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisscotlandnwisids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisscotlandnwisids0"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisscotlandnwisids0_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisscotlandnwisids0")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisscotlandnwisids0"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisscotlandnwisids0_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisscotlandnwisids0")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisscotlandnwisids0"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisscotlandnwisids0_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisscotlandnwisids0"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
-# Can we simplify and use just a method, then import these methods?
-# def missingreport_s3(context, msg: str, source="cuahsihisscotlandnwisids0"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="cuahsihisscotlandnwisids0"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_cuahsihisscotlandnwisids0():
- containers = cuahsihisscotlandnwisids0_getImage()
- harvest = cuahsihisscotlandnwisids0_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = cuahsihisscotlandnwisids0_missingreport_s3(start=harvest)
- report_idstat = cuahsihisscotlandnwisids0_identifier_stats(start=report_ms3)
-    # for some reason, this causes a missing msg parameter error
- report_bucketurl = cuahsihisscotlandnwisids0_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="cuahsihisscotlandnwisids0")
- load_release = cuahsihisscotlandnwisids0_naburelease(start=harvest)
- load_uploadrelease = cuahsihisscotlandnwisids0_uploadrelease(start=load_release)
-
- load_prune = cuahsihisscotlandnwisids0_nabu_prune(start=load_uploadrelease)
- load_prov = cuahsihisscotlandnwisids0_nabuprov(start=load_prune)
- load_org = cuahsihisscotlandnwisids0_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=cuahsihisscotlandnwisids0_missingreport_graph(start=load_org)
- report_graph=cuahsihisscotlandnwisids0_graph_reports(start=report_msgraph)
-
-
-
-
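Every generated ops file deleted in this commit chains its ops with Dagster Nothing dependencies, per the docs link repeated in each `harvest_*` graph. As a reference for readers skimming the deletions, a minimal sketch of that pattern on its own, with hypothetical op names (`pull_images`, `run_harvest`) standing in for the generated ones:

```python
from dagster import In, Nothing, graph, op

@op
def pull_images():
    # performs side effects only; returns nothing useful downstream
    pass

@op(ins={"start": In(Nothing)})
def run_harvest():
    # the In(Nothing) input imposes ordering after pull_images
    # without passing any data between the ops
    pass

@graph
def harvest_example():
    # Nothing dependencies are wired by keyword in the graph body
    run_harvest(start=pull_images())
```

This is the same ordering device the generated graphs use to sequence gleaner, the nabu steps, and the report ops.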
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisshalenetworkodmids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisshalenetworkodmids0.py
deleted file mode 100644
index e2b56ed5..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisshalenetworkodmids0.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, and not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
-    # revision of EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
-    ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
-    # docker.py: for a replicated job, total completions = replicas
-    # with replicas = 0 you do not get a container
-    service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
-        mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
-            raise Exception(f"Container for service {name} not starting")
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
-        cid = container.id # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
-        ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
-        # Future: need to extract files, and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def cuahsihisshalenetworkodmids0_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def cuahsihisshalenetworkodmids0_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "cuahsihisshalenetworkodmids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisshalenetworkodmids0_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "cuahsihisshalenetworkodmids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisshalenetworkodmids0_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "cuahsihisshalenetworkodmids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisshalenetworkodmids0_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "cuahsihisshalenetworkodmids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisshalenetworkodmids0_naburelease(context):
- returned_value = gleanerio(context,("release"), "cuahsihisshalenetworkodmids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisshalenetworkodmids0_uploadrelease(context):
- returned_value = postRelease("cuahsihisshalenetworkodmids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisshalenetworkodmids0_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisshalenetworkodmids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisshalenetworkodmids0"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-    r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisshalenetworkodmids0_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisshalenetworkodmids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisshalenetworkodmids0"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisshalenetworkodmids0_graph_reports(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisshalenetworkodmids0")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisshalenetworkodmids0"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisshalenetworkodmids0_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisshalenetworkodmids0")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisshalenetworkodmids0"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
-    get_dagster_logger().info(f"identifier stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisshalenetworkodmids0_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisshalenetworkodmids0"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
-    get_dagster_logger().info(f"bucket urls report returned {r} ")
- return
-
-
-#Can we simplify and use just a method. Then import these methods?
-# def missingreport_s3(context, msg: str, source="cuahsihisshalenetworkodmids0"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="cuahsihisshalenetworkodmids0"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_cuahsihisshalenetworkodmids0():
- containers = cuahsihisshalenetworkodmids0_getImage()
- harvest = cuahsihisshalenetworkodmids0_gleaner(start=containers)
-
-# defining nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = cuahsihisshalenetworkodmids0_missingreport_s3(start=harvest)
- report_idstat = cuahsihisshalenetworkodmids0_identifier_stats(start=report_ms3)
-    # for some reason, this causes a missing msg parameter error
- report_bucketurl = cuahsihisshalenetworkodmids0_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="cuahsihisshalenetworkodmids0")
- load_release = cuahsihisshalenetworkodmids0_naburelease(start=harvest)
- load_uploadrelease = cuahsihisshalenetworkodmids0_uploadrelease(start=load_release)
-
- load_prune = cuahsihisshalenetworkodmids0_nabu_prune(start=load_uploadrelease)
- load_prov = cuahsihisshalenetworkodmids0_nabuprov(start=load_prune)
- load_org = cuahsihisshalenetworkodmids0_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=cuahsihisshalenetworkodmids0_missingreport_graph(start=load_org)
- report_graph=cuahsihisshalenetworkodmids0_graph_reports(start=report_msgraph)
-
-
-
-
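Each deleted file also carries an identical `_create_service` helper that runs Gleaner or Nabu as a one-shot Docker Swarm service rather than a plain container. A stripped-down sketch of that technique, assuming a daemon already in swarm mode; the image, args, and service name here are placeholders:

```python
import docker
from docker.types import RestartPolicy, ServiceMode

client = docker.from_env()

# a replicated-job with replicas=1 runs the task exactly once;
# condition="none" keeps swarm from restarting it when it exits
service = client.services.create(
    "alpine:latest",
    args=["echo", "one-shot task"],
    name="sch_example_job",
    restart_policy=RestartPolicy(condition="none"),
    mode=ServiceMode("replicated-job", replicas=1, concurrency=1),
)

# a real caller polls until the job's container exists (as the
# generated helper does) before tailing logs and removing the service
print([task["Status"]["State"] for task in service.tasks()])
service.remove()
```

The helper's polling loop exists because the service object is returned before swarm has scheduled a container, so the container list can stay empty for several seconds.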
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisshalenetworkodmids1.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisshalenetworkodmids1.py
deleted file mode 100644
index 53689929..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisshalenetworkodmids1.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, and not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
-    # revision of EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
-    ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
-    # docker.py: for a replicated job, total completions = replicas
-    # with replicas = 0 you do not get a container
-    service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
-        mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
-            raise Exception(f"Container for service {name} not starting")
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
-        cid = container.id # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
-        ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
-        # Future: need to extract files, and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def cuahsihisshalenetworkodmids1_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def cuahsihisshalenetworkodmids1_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "cuahsihisshalenetworkodmids1")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisshalenetworkodmids1_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "cuahsihisshalenetworkodmids1")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisshalenetworkodmids1_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "cuahsihisshalenetworkodmids1")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisshalenetworkodmids1_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "cuahsihisshalenetworkodmids1")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisshalenetworkodmids1_naburelease(context):
- returned_value = gleanerio(context,("release"), "cuahsihisshalenetworkodmids1")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisshalenetworkodmids1_uploadrelease(context):
- returned_value = postRelease("cuahsihisshalenetworkodmids1")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisshalenetworkodmids1_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisshalenetworkodmids1")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisshalenetworkodmids1"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-    r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisshalenetworkodmids1_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisshalenetworkodmids1")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisshalenetworkodmids1"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisshalenetworkodmids1_graph_reports(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisshalenetworkodmids1")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisshalenetworkodmids1"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisshalenetworkodmids1_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisshalenetworkodmids1")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisshalenetworkodmids1"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisshalenetworkodmids1_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisshalenetworkodmids1"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
-#Can we simplify and use just a method, then import these methods?
-# def missingreport_s3(context, msg: str, source="cuahsihisshalenetworkodmids1"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="cuahsihisshalenetworkodmids1"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_cuahsihisshalenetworkodmids1():
- containers = cuahsihisshalenetworkodmids1_getImage()
- harvest = cuahsihisshalenetworkodmids1_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = cuahsihisshalenetworkodmids1_missingreport_s3(start=harvest)
- report_idstat = cuahsihisshalenetworkodmids1_identifier_stats(start=report_ms3)
- # for some reason, this causes a "msg parameter missing" error
- report_bucketurl = cuahsihisshalenetworkodmids1_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="cuahsihisshalenetworkodmids1")
- load_release = cuahsihisshalenetworkodmids1_naburelease(start=harvest)
- load_uploadrelease = cuahsihisshalenetworkodmids1_uploadrelease(start=load_release)
-
- load_prune = cuahsihisshalenetworkodmids1_nabu_prune(start=load_uploadrelease)
- load_prov = cuahsihisshalenetworkodmids1_nabuprov(start=load_prune)
- load_org = cuahsihisshalenetworkodmids1_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=cuahsihisshalenetworkodmids1_missingreport_graph(start=load_org)
- report_graph=cuahsihisshalenetworkodmids1_graph_reports(start=report_msgraph)
-
-
-
-
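NOTE: every module in this diff is the same generated code with a different hard-coded source name; the "#Can we simplify and use just a method" comment above asks the obvious question. A minimal sketch of that refactor, assuming the shared helpers (gleanerio() and the GLEANERIO_* constants) move into one importable module; make_harvest_graph and the trimmed two-op pipeline are hypothetical names for illustration, not existing code:

    from dagster import In, Nothing, get_dagster_logger, graph, op

    def make_harvest_graph(source: str, gleanerio_fn):
        # Hypothetical factory: build the per-source ops at run time instead of
        # generating one near-identical module per source. gleanerio_fn is the
        # module-level gleanerio() helper defined in the files above.
        @op(name=f"{source}_gleaner")
        def gleaner_op(context):
            rc = gleanerio_fn(context, "gleaner", source)
            get_dagster_logger().info(f"Gleaner returned {rc}")

        @op(name=f"{source}_nabu_prune", ins={"start": In(Nothing)})
        def prune_op(context):
            rc = gleanerio_fn(context, "prune", source)
            get_dagster_logger().info(f"nabu prune returned {rc}")

        @graph(name=f"harvest_{source}")
        def harvest():
            prune_op(start=gleaner_op())

        return harvest

    # Usage sketch: one graph per configured source, e.g.
    # graphs = [make_harvest_graph(s, gleanerio) for s in ["cuahsihisshalenetworkodmids1"]]

The remaining ops (release, upload, prov, orgs, and the report ops) would be registered the same way inside the factory, preserving the dependency order of the generated graphs.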
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisskcmilltownids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisskcmilltownids0.py
deleted file mode 100644
index 3244cded..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisskcmilltownids0.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in Docker, not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
- # a future revision of EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
- ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
- # docker.py: for a replicated job, total completions = replicas
- # with replicas=0 you do not get a container
- service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
- mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise f"Container for service {name} not starting"
-
- get_dagster_logger().info(str(len(containers))) # the logger requires a str message
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
- cid = container.id # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
- ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
- # Future: need to extract files and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def cuahsihisskcmilltownids0_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def cuahsihisskcmilltownids0_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "cuahsihisskcmilltownids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisskcmilltownids0_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "cuahsihisskcmilltownids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisskcmilltownids0_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "cuahsihisskcmilltownids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisskcmilltownids0_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "cuahsihisskcmilltownids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisskcmilltownids0_naburelease(context):
- returned_value = gleanerio(context,("release"), "cuahsihisskcmilltownids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisskcmilltownids0_uploadrelease(context):
- returned_value = postRelease("cuahsihisskcmilltownids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisskcmilltownids0_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisskcmilltownids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisskcmilltownids0"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisskcmilltownids0_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisskcmilltownids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisskcmilltownids0"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # milled/graph check only; summon check off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisskcmilltownids0_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisskcmilltownids0")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisskcmilltownids0"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisskcmilltownids0_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisskcmilltownids0")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisskcmilltownids0"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisskcmilltownids0_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisskcmilltownids0"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
-#Can we simplify and use just a method, then import these methods?
-# def missingreport_s3(context, msg: str, source="cuahsihisskcmilltownids0"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="cuahsihisskcmilltownids0"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_cuahsihisskcmilltownids0():
- containers = cuahsihisskcmilltownids0_getImage()
- harvest = cuahsihisskcmilltownids0_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = cuahsihisskcmilltownids0_missingreport_s3(start=harvest)
- report_idstat = cuahsihisskcmilltownids0_identifier_stats(start=report_ms3)
- # for some reason, this causes a "msg parameter missing" error
- report_bucketurl = cuahsihisskcmilltownids0_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="cuahsihisskcmilltownids0")
- load_release = cuahsihisskcmilltownids0_naburelease(start=harvest)
- load_uploadrelease = cuahsihisskcmilltownids0_uploadrelease(start=load_release)
-
- load_prune = cuahsihisskcmilltownids0_nabu_prune(start=load_uploadrelease)
- load_prov = cuahsihisskcmilltownids0_nabuprov(start=load_prune)
- load_org = cuahsihisskcmilltownids0_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=cuahsihisskcmilltownids0_missingreport_graph(start=load_org)
- report_graph=cuahsihisskcmilltownids0_graph_reports(start=report_msgraph)
-
-
-
-
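NOTE: the five-branch if/elif chain in gleanerio() assigns the same four values (IMAGE, ARGS, NAME, WorkingDir) for every mode. A table-driven sketch of the same dispatch, assuming the GLEANERIO_* constants defined at the top of these modules are in scope; MODE_TABLE and resolve_mode are hypothetical names, not existing code:

    # mode -> (image, workdir, args-builder), mirroring the if/elif chain above
    MODE_TABLE = {
        "gleaner": (GLEANERIO_GLEANER_IMAGE, "/gleaner/",
                    lambda s: ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH, "-source", s, "--rude"]),
        "prune":   (GLEANERIO_NABU_IMAGE, "/nabu/",
                    lambda s: ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + s]),
        "prov":    (GLEANERIO_NABU_IMAGE, "/nabu/",
                    lambda s: ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + s]),
        "orgs":    (GLEANERIO_NABU_IMAGE, "/nabu/",
                    lambda s: ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]),
        "release": (GLEANERIO_NABU_IMAGE, "/nabu/",
                    lambda s: ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + s]),
    }

    def resolve_mode(mode: str, source: str):
        # Raising on an unknown mode surfaces the error at the call site,
        # instead of the silent returnCode = 1 the current chain returns.
        if mode not in MODE_TABLE:
            raise ValueError(f"unknown gleanerio mode: {mode}")
        image, workdir, make_args = MODE_TABLE[mode]
        return image, workdir, make_args(source), f"sch_{source}_{mode}"

One table keeps a new mode to a single entry and makes the gleaner/nabu argument differences easy to audit.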
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihissnotelids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihissnotelids0.py
deleted file mode 100644
index 94717ef7..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihissnotelids0.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in Docker, not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
- # a future revision of EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
- ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
- # docker.py: for a replicated job, total completions = replicas
- # with replicas=0 you do not get a container
- service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
- mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise f"Container for service {name} not starting"
-
- get_dagster_logger().info(str(len(containers))) # the logger requires a str message
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
- cid = container.id # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
- ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
- # Future: need to extract files and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def cuahsihissnotelids0_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def cuahsihissnotelids0_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "cuahsihissnotelids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihissnotelids0_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "cuahsihissnotelids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihissnotelids0_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "cuahsihissnotelids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihissnotelids0_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "cuahsihissnotelids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihissnotelids0_naburelease(context):
- returned_value = gleanerio(context,("release"), "cuahsihissnotelids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihissnotelids0_uploadrelease(context):
- returned_value = postRelease("cuahsihissnotelids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def cuahsihissnotelids0_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihissnotelids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihissnotelids0"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihissnotelids0_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihissnotelids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihissnotelids0"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihissnotelids0_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihissnotelids0")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihissnotelids0"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihissnotelids0_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihissnotelids0")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihissnotelids0"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihissnotelids0_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihissnotelids0"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
-#Can we simplify and use just a method. Then import these methods?
-# def missingreport_s3(context, msg: str, source="cuahsihissnotelids0"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="cuahsihissnotelids0"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_cuahsihissnotelids0():
- containers = cuahsihissnotelids0_getImage()
- harvest = cuahsihissnotelids0_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = cuahsihissnotelids0_missingreport_s3(start=harvest)
- report_idstat = cuahsihissnotelids0_identifier_stats(start=report_ms3)
-    # for some reason, this caused a "msg parameter missing" error
- report_bucketurl = cuahsihissnotelids0_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="cuahsihissnotelids0")
- load_release = cuahsihissnotelids0_naburelease(start=harvest)
- load_uploadrelease = cuahsihissnotelids0_uploadrelease(start=load_release)
-
- load_prune = cuahsihissnotelids0_nabu_prune(start=load_uploadrelease)
- load_prov = cuahsihissnotelids0_nabuprov(start=load_prune)
- load_org = cuahsihissnotelids0_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=cuahsihissnotelids0_missingreport_graph(start=load_org)
- report_graph=cuahsihissnotelids0_graph_reports(start=report_msgraph)
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisswedishmonitoringdataids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisswedishmonitoringdataids0.py
deleted file mode 100644
index 01549a00..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisswedishmonitoringdataids0.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, and not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
-    # revision of EC utilities, will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
-    ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
-    # docker.py: for a replicated job, total completions = replicas
-    # with replicas=0 you do not get a container
-    service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
-        mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
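-    # poll about once a second (12 tries max) until swarm has actually
-    # scheduled a container for the replicated-job service; the caller tails
-    # logs from the container, so the bare service handle is not enough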
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise f"Container for service {name} not starting"
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
-        cid = container.id # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
-        ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
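-        # latin-1 maps every byte value, so decoding arbitrary container
-        # output can never raise UnicodeDecodeError (utf-8 could)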
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
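-        # pull {WorkingDir}/logs out of the container as a tar stream via the
-        # Docker Engine `GET /containers/{id}/archive` endpoint, going through
-        # the Portainer-proxied API with the X-API-Key header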
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
-        # Future: need to extract files and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
-        # looks like get_archive also has issues. Returns nothing.
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def cuahsihisswedishmonitoringdataids0_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def cuahsihisswedishmonitoringdataids0_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "cuahsihisswedishmonitoringdataids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisswedishmonitoringdataids0_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "cuahsihisswedishmonitoringdataids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisswedishmonitoringdataids0_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "cuahsihisswedishmonitoringdataids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisswedishmonitoringdataids0_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "cuahsihisswedishmonitoringdataids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisswedishmonitoringdataids0_naburelease(context):
- returned_value = gleanerio(context,("release"), "cuahsihisswedishmonitoringdataids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisswedishmonitoringdataids0_uploadrelease(context):
- returned_value = postRelease("cuahsihisswedishmonitoringdataids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisswedishmonitoringdataids0_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisswedishmonitoringdataids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisswedishmonitoringdataids0"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-    r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisswedishmonitoringdataids0_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisswedishmonitoringdataids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisswedishmonitoringdataids0"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisswedishmonitoringdataids0_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisswedishmonitoringdataids0")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisswedishmonitoringdataids0"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisswedishmonitoringdataids0_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisswedishmonitoringdataids0")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisswedishmonitoringdataids0"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisswedishmonitoringdataids0_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisswedishmonitoringdataids0"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
-#Can we simplify and use just a method. Then import these methods?
-# def missingreport_s3(context, msg: str, source="cuahsihisswedishmonitoringdataids0"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="cuahsihisswedishmonitoringdataids0"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_cuahsihisswedishmonitoringdataids0():
- containers = cuahsihisswedishmonitoringdataids0_getImage()
- harvest = cuahsihisswedishmonitoringdataids0_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = cuahsihisswedishmonitoringdataids0_missingreport_s3(start=harvest)
- report_idstat = cuahsihisswedishmonitoringdataids0_identifier_stats(start=report_ms3)
-    # for some reason, this caused a "msg parameter missing" error
- report_bucketurl = cuahsihisswedishmonitoringdataids0_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="cuahsihisswedishmonitoringdataids0")
- load_release = cuahsihisswedishmonitoringdataids0_naburelease(start=harvest)
- load_uploadrelease = cuahsihisswedishmonitoringdataids0_uploadrelease(start=load_release)
-
- load_prune = cuahsihisswedishmonitoringdataids0_nabu_prune(start=load_uploadrelease)
- load_prov = cuahsihisswedishmonitoringdataids0_nabuprov(start=load_prune)
- load_org = cuahsihisswedishmonitoringdataids0_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=cuahsihisswedishmonitoringdataids0_missingreport_graph(start=load_org)
- report_graph=cuahsihisswedishmonitoringdataids0_graph_reports(start=report_msgraph)
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihistarlandwaterqualityids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihistarlandwaterqualityids0.py
deleted file mode 100644
index 00a25bea..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihistarlandwaterqualityids0.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, and not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
-    # revision of EC utilities, will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
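-    # POSTing to the SPARQL endpoint with ?uri=<release_url> asks the graph
-    # store (Blazegraph, judging by the /namespace/.../sparql URL pattern) to
-    # fetch and load the N-Quads release straight from the minio bucket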
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
-    ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
-    # docker.py: for a replicated job, total completions = replicas
-    # with replicas=0 you do not get a container
-    service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
-        mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
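-    # poll about once a second (12 tries max) until swarm has actually
-    # scheduled a container for the replicated-job service; the caller tails
-    # logs from the container, so the bare service handle is not enough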
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise f"Container for service {name} not starting"
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
-        cid = container.id # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
-        ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
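-        # latin-1 maps every byte value, so decoding arbitrary container
-        # output can never raise UnicodeDecodeError (utf-8 could)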
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
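-        # pull {WorkingDir}/logs out of the container as a tar stream via the
-        # Docker Engine `GET /containers/{id}/archive` endpoint, going through
-        # the Portainer-proxied API with the X-API-Key header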
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
-        # Future: need to extract files and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
-        # looks like get_archive also has issues. Returns nothing.
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def cuahsihistarlandwaterqualityids0_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def cuahsihistarlandwaterqualityids0_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "cuahsihistarlandwaterqualityids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihistarlandwaterqualityids0_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "cuahsihistarlandwaterqualityids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihistarlandwaterqualityids0_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "cuahsihistarlandwaterqualityids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihistarlandwaterqualityids0_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "cuahsihistarlandwaterqualityids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihistarlandwaterqualityids0_naburelease(context):
- returned_value = gleanerio(context,("release"), "cuahsihistarlandwaterqualityids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihistarlandwaterqualityids0_uploadrelease(context):
- returned_value = postRelease("cuahsihistarlandwaterqualityids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def cuahsihistarlandwaterqualityids0_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihistarlandwaterqualityids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihistarlandwaterqualityids0"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-    r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihistarlandwaterqualityids0_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihistarlandwaterqualityids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihistarlandwaterqualityids0"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihistarlandwaterqualityids0_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihistarlandwaterqualityids0")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihistarlandwaterqualityids0"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihistarlandwaterqualityids0_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihistarlandwaterqualityids0")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihistarlandwaterqualityids0"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihistarlandwaterqualityids0_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihistarlandwaterqualityids0"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
-#Can we simplify and use just a method. Then import these methods?
-# def missingreport_s3(context, msg: str, source="cuahsihistarlandwaterqualityids0"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="cuahsihistarlandwaterqualityids0"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_cuahsihistarlandwaterqualityids0():
- containers = cuahsihistarlandwaterqualityids0_getImage()
- harvest = cuahsihistarlandwaterqualityids0_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = cuahsihistarlandwaterqualityids0_missingreport_s3(start=harvest)
- report_idstat = cuahsihistarlandwaterqualityids0_identifier_stats(start=report_ms3)
-    # for some reason, this causes a "missing msg parameter" error
- report_bucketurl = cuahsihistarlandwaterqualityids0_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="cuahsihistarlandwaterqualityids0")
- load_release = cuahsihistarlandwaterqualityids0_naburelease(start=harvest)
- load_uploadrelease = cuahsihistarlandwaterqualityids0_uploadrelease(start=load_release)
-
- load_prune = cuahsihistarlandwaterqualityids0_nabu_prune(start=load_uploadrelease)
- load_prov = cuahsihistarlandwaterqualityids0_nabuprov(start=load_prune)
- load_org = cuahsihistarlandwaterqualityids0_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=cuahsihistarlandwaterqualityids0_missingreport_graph(start=load_org)
- report_graph=cuahsihistarlandwaterqualityids0_graph_reports(start=report_msgraph)
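-
-# If a schedulable job is needed, one could be built from this graph
-# (hypothetical, not part of the generated code):
-#   harvest_cuahsihistarlandwaterqualityids0.to_job()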
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihistncwaterdataids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihistncwaterdataids0.py
deleted file mode 100644
index 540be88d..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihistncwaterdataids0.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
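-# e.g. with GLEANERIO_GRAPH_URL=http://triplestore:9999/blazegraph and
-# GLEANERIO_GRAPH_NAMESPACE=earthcube (hypothetical values), this returns
-# http://triplestore:9999/blazegraph/namespace/earthcube/sparql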
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
-            and not secure):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
-            and secure):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
- # revision of EC utilities, will have a insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
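-
-# For reference, the insert above amounts to a plain POST that asks the
-# Blazegraph namespace endpoint to load the release nquads by URL
-# (hypothetical host values):
-#   curl -X POST "http://triplestore:9999/blazegraph/namespace/earthcube/sparql?uri=http://minio:9000/gleaner/graphs/latest/SOURCE_release.nq"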
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
-    ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
-    # docker.py: for a replicated job, total completions = replicas
-    # with replicas=0 you do not get a container
-    service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
-        mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise f"Container for service {name} not starting"
-
-    get_dagster_logger().info(f"found {len(containers)} container(s) for service")
- return service, containers[0]
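-
-# Design note: a "replicated-job" service with replicas=1 and
-# RestartPolicy(condition='none') runs the container exactly once to
-# completion; the polling loop above waits up to ~12s for swarm to actually
-# schedule the container before giving up.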
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
-        cid = container.id  # legacy until the service start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
-        # do not let a possible issue with container logs stop the log upload.
-        ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
-    # Future: need to extract files and upload them
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
-    # get_archive also seems to have issues; it returns nothing:
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
-        if not DEBUG:
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def cuahsihistncwaterdataids0_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def cuahsihistncwaterdataids0_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "cuahsihistncwaterdataids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihistncwaterdataids0_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "cuahsihistncwaterdataids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihistncwaterdataids0_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "cuahsihistncwaterdataids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihistncwaterdataids0_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "cuahsihistncwaterdataids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihistncwaterdataids0_naburelease(context):
- returned_value = gleanerio(context,("release"), "cuahsihistncwaterdataids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihistncwaterdataids0_uploadrelease(context):
- returned_value = postRelease("cuahsihistncwaterdataids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def cuahsihistncwaterdataids0_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihistncwaterdataids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihistncwaterdataids0"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-    r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihistncwaterdataids0_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihistncwaterdataids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihistncwaterdataids0"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihistncwaterdataids0_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihistncwaterdataids0")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihistncwaterdataids0"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihistncwaterdataids0_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihistncwaterdataids0")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihistncwaterdataids0"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihistncwaterdataids0_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihistncwaterdataids0"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
-#Can we simplify and use just a method. Then import these methods?
-# def missingreport_s3(context, msg: str, source="cuahsihistncwaterdataids0"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="cuahsihistncwaterdataids0"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_cuahsihistncwaterdataids0():
- containers = cuahsihistncwaterdataids0_getImage()
- harvest = cuahsihistncwaterdataids0_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = cuahsihistncwaterdataids0_missingreport_s3(start=harvest)
- report_idstat = cuahsihistncwaterdataids0_identifier_stats(start=report_ms3)
-    # for some reason, this causes a "missing msg parameter" error
- report_bucketurl = cuahsihistncwaterdataids0_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="cuahsihistncwaterdataids0")
- load_release = cuahsihistncwaterdataids0_naburelease(start=harvest)
- load_uploadrelease = cuahsihistncwaterdataids0_uploadrelease(start=load_release)
-
- load_prune = cuahsihistncwaterdataids0_nabu_prune(start=load_uploadrelease)
- load_prov = cuahsihistncwaterdataids0_nabuprov(start=load_prune)
- load_org = cuahsihistncwaterdataids0_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=cuahsihistncwaterdataids0_missingreport_graph(start=load_org)
- report_graph=cuahsihistncwaterdataids0_graph_reports(start=report_msgraph)
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihistrwaids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihistrwaids0.py
deleted file mode 100644
index 50476d8f..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihistrwaids0.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
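-# e.g. _pythonMinioUrl("mybucket.s3.amazonaws.com") returns "s3.amazonaws.com";
-# any non-AWS address is passed through unchanged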
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
-            and not secure):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
-            and secure):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
- # revision of EC utilities, will have a insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
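-
-# The _general_configs["HttpHeaders"] assignment above leans on a docker-py
-# internal (see the decorators.py link in gleanerio below) so that every
-# Docker API request carries the Portainer X-API-Key header.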
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
-    ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
-    # docker.py: for a replicated job, total completions = replicas
-    # with replicas=0 you do not get a container
-    service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
-        mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise f"Container for service {name} not starting"
-
-    get_dagster_logger().info(f"found {len(containers)} container(s) for service")
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
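-
-    # mode summary, per the branches above: "gleaner" harvests the source;
-    # "prune" and "release" run nabu over summoned/<source>; "prov" runs
-    # nabu prefix over prov/<source>; "orgs" runs nabu prefix over orgs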
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
-        cid = container.id  # legacy until the service start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
-        # do not let a possible issue with container logs stop the log upload.
-        ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
-    # Future: need to extract files and upload them
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
-    # get_archive also seems to have issues; it returns nothing:
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
-        if not DEBUG:
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def cuahsihistrwaids0_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def cuahsihistrwaids0_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "cuahsihistrwaids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihistrwaids0_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "cuahsihistrwaids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihistrwaids0_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "cuahsihistrwaids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihistrwaids0_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "cuahsihistrwaids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihistrwaids0_naburelease(context):
- returned_value = gleanerio(context,("release"), "cuahsihistrwaids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihistrwaids0_uploadrelease(context):
- returned_value = postRelease("cuahsihistrwaids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def cuahsihistrwaids0_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihistrwaids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihistrwaids0"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihistrwaids0_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihistrwaids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihistrwaids0"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihistrwaids0_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihistrwaids0")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihistrwaids0"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihistrwaids0_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihistrwaids0")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihistrwaids0"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihistrwaids0_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihistrwaids0"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
-#Can we simplify and use just a method. Then import these methods?
-# def missingreport_s3(context, msg: str, source="cuahsihistrwaids0"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="cuahsihistrwaids0"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
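-# A hedged sketch (an assumption, not part of the generated output): a factory
-# along these lines could replace the per-source ops above, assuming gleanerio
-# keeps its (context, mode, source) signature.
-# def make_gleanerio_op(source: str, mode: str):
-#     @op(name=f"{source}_{mode}", ins={"start": In(Nothing)})
-#     def _op(context):
-#         returned_value = gleanerio(context, mode, source)
-#         get_dagster_logger().info(f"{mode} returned value:{returned_value}")
-#     return _op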
-@graph
-def harvest_cuahsihistrwaids0():
- containers = cuahsihistrwaids0_getImage()
- harvest = cuahsihistrwaids0_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = cuahsihistrwaids0_missingreport_s3(start=harvest)
- report_idstat = cuahsihistrwaids0_identifier_stats(start=report_ms3)
- # for some reason, this causes a msg parameter missing
- report_bucketurl = cuahsihistrwaids0_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="cuahsihistrwaids0")
- load_release = cuahsihistrwaids0_naburelease(start=harvest)
- load_uploadrelease = cuahsihistrwaids0_uploadrelease(start=load_release)
-
- load_prune = cuahsihistrwaids0_nabu_prune(start=load_uploadrelease)
- load_prov = cuahsihistrwaids0_nabuprov(start=load_prune)
- load_org = cuahsihistrwaids0_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=cuahsihistrwaids0_missingreport_graph(start=load_org)
- report_graph=cuahsihistrwaids0_graph_reports(start=report_msgraph)
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihistuolumnemdwids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihistuolumnemdwids0.py
deleted file mode 100644
index 32535a1f..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihistuolumnemdwids0.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils.util  # strtobool lives in distutils.util; import the submodule explicitly
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- # use a context manager so the file handle is closed promptly
- with open(image_path, 'rb') as f:
- data = f.read()
- return data
-
-
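-# load_data: urlopen raises ValueError for a non-URL string, which is what
-# drives the local-file fallback below.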
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
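-# On the default ports (80 plain, 443 TLS) the Minio client wants a bare
-# hostname; any other port must be appended explicitly, as below.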
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
- # revision of EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
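-# The Docker Engine API is reached through Portainer, which authenticates
-# with an X-API-Key header; docker-py has no public hook for extra headers,
-# hence the _general_configs injection below.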
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
- ## thoughts:
- # return both service and container, since there is exactly one
- restart_policy = RestartPolicy(condition='none')
- # docker-py: for a replicated-job service, total completions = replicas
- # with replicas=0 you never get a container
- service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
- mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
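- # Swarm schedules the replicated-job task asynchronously, so poll (up to
- # ~12s) for the task's container, matched by the swarm service-name label.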
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise f"Container for service {name} not starting"
-
- get_dagster_logger().info(f"containers found: {len(containers)}")
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
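- # mode dispatch: "gleaner" harvests the source; the remaining modes run
- # nabu against different object prefixes in the same bucket.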
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
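- # these env vars mirror the viper BindEnv mappings quoted above, so one
- # config file can serve every deployment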
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
- cid = container.id # legacy until the service start-up handling gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
- ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait, expect 200
- # we want the logs no matter what, so do not exit yet.
- ## or should log retrieval be moved into finally?
- ### in which case it needs to be a method that does not raise errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
- # Future: need to extract files, then upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def cuahsihistuolumnemdwids0_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def cuahsihistuolumnemdwids0_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "cuahsihistuolumnemdwids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihistuolumnemdwids0_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "cuahsihistuolumnemdwids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihistuolumnemdwids0_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "cuahsihistuolumnemdwids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihistuolumnemdwids0_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "cuahsihistuolumnemdwids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihistuolumnemdwids0_naburelease(context):
- returned_value = gleanerio(context,("release"), "cuahsihistuolumnemdwids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihistuolumnemdwids0_uploadrelease(context):
- returned_value = postRelease("cuahsihistuolumnemdwids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def cuahsihistuolumnemdwids0_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihistuolumnemdwids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihistuolumnemdwids0"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihistuolumnemdwids0_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihistuolumnemdwids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihistuolumnemdwids0"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihistuolumnemdwids0_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihistuolumnemdwids0")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihistuolumnemdwids0"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihistuolumnemdwids0_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihistuolumnemdwids0")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihistuolumnemdwids0"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihistuolumnemdwids0_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihistuolumnemdwids0"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
-#Can we simplify and use just a method. Then import these methods?
-# def missingreport_s3(context, msg: str, source="cuahsihistuolumnemdwids0"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="cuahsihistuolumnemdwids0"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
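-# (A factory parameterized over `source` would collapse these generated
-# modules; see the make_gleanerio_op sketch in the cuahsihistrwaids0 module.)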
-@graph
-def harvest_cuahsihistuolumnemdwids0():
- containers = cuahsihistuolumnemdwids0_getImage()
- harvest = cuahsihistuolumnemdwids0_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = cuahsihistuolumnemdwids0_missingreport_s3(start=harvest)
- report_idstat = cuahsihistuolumnemdwids0_identifier_stats(start=report_ms3)
- # for some reason, this causes a msg parameter missing
- report_bucketurl = cuahsihistuolumnemdwids0_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="cuahsihistuolumnemdwids0")
- load_release = cuahsihistuolumnemdwids0_naburelease(start=harvest)
- load_uploadrelease = cuahsihistuolumnemdwids0_uploadrelease(start=load_release)
-
- load_prune = cuahsihistuolumnemdwids0_nabu_prune(start=load_uploadrelease)
- load_prov = cuahsihistuolumnemdwids0_nabuprov(start=load_prune)
- load_org = cuahsihistuolumnemdwids0_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=cuahsihistuolumnemdwids0_missingreport_graph(start=load_org)
- report_graph=cuahsihistuolumnemdwids0_graph_reports(start=report_msgraph)
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisubwpadids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisubwpadids0.py
deleted file mode 100644
index b1a7ebb3..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisubwpadids0.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils.util  # strtobool lives in distutils.util; import the submodule explicitly
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- # use a context manager so the file handle is closed promptly
- with open(image_path, 'rb') as f:
- data = f.read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
- # revision of EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
- ## thoughts:
- # return both service and container, since there is exactly one
- restart_policy = RestartPolicy(condition='none')
- # docker-py: for a replicated-job service, total completions = replicas
- # with replicas=0 you never get a container
- service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
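- # swarm mounts the gleaner/nabu docker configs into the service at the
- # paths given by the ConfigReference objects above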
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
- mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise f"Container for service {name} not starting"
-
- get_dagster_logger().info(f"containers found: {len(containers)}")
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
- cid = container.id # legacy until the service start-up handling gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
- ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait, expect 200
- # we want the logs no matter what, so do not exit yet.
- ## or should log retrieval be moved into finally?
- ### in which case it needs to be a method that does not raise errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
- # Future: need to extract files and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def cuahsihisubwpadids0_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def cuahsihisubwpadids0_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "cuahsihisubwpadids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisubwpadids0_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "cuahsihisubwpadids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisubwpadids0_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "cuahsihisubwpadids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisubwpadids0_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "cuahsihisubwpadids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisubwpadids0_naburelease(context):
- returned_value = gleanerio(context,("release"), "cuahsihisubwpadids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisubwpadids0_uploadrelease(context):
- returned_value = postRelease("cuahsihisubwpadids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisubwpadids0_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisubwpadids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisubwpadids0"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisubwpadids0_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisubwpadids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisubwpadids0"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisubwpadids0_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisubwpadids0")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisubwpadids0"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value is already JSON-encoded
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisubwpadids0_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisubwpadids0")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisubwpadids0"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisubwpadids0_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisubwpadids0"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
-#Can we simplify and use just a method. Then import these methods?
-# def missingreport_s3(context, msg: str, source="cuahsihisubwpadids0"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="cuahsihisubwpadids0"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_cuahsihisubwpadids0():
- containers = cuahsihisubwpadids0_getImage()
- harvest = cuahsihisubwpadids0_gleaner(start=containers)
-
-# defining Nothing dependencies; a minimal sketch follows this file's diff
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = cuahsihisubwpadids0_missingreport_s3(start=harvest)
- report_idstat = cuahsihisubwpadids0_identifier_stats(start=report_ms3)
- # for some reason, this causes a "missing msg parameter" error
- report_bucketurl = cuahsihisubwpadids0_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="cuahsihisubwpadids0")
- load_release = cuahsihisubwpadids0_naburelease(start=harvest)
- load_uploadrelease = cuahsihisubwpadids0_uploadrelease(start=load_release)
-
- load_prune = cuahsihisubwpadids0_nabu_prune(start=load_uploadrelease)
- load_prov = cuahsihisubwpadids0_nabuprov(start=load_prune)
- load_org = cuahsihisubwpadids0_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=cuahsihisubwpadids0_missingreport_graph(start=load_org)
- report_graph=cuahsihisubwpadids0_graph_reports(start=report_msgraph)
-
-
-
-
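
The generated `harvest_*` graphs above chain their ops with Dagster "Nothing" dependencies: each downstream op declares `ins={"start": In(Nothing)}` and is invoked with `start=<upstream result>`, so execution order is enforced without passing any data. A minimal, self-contained sketch of the same pattern (hypothetical op names, not part of the generated code):

```python
from dagster import In, Nothing, get_dagster_logger, graph, op

@op
def pull_images():
    get_dagster_logger().info("images pulled")

@op(ins={"start": In(Nothing)})
def harvest():
    get_dagster_logger().info("runs only after pull_images")

@op(ins={"start": In(Nothing)})
def report():
    get_dagster_logger().info("runs only after harvest")

@graph
def minimal_harvest():
    # ordering is enforced by the Nothing inputs; no values flow between ops
    report(start=harvest(start=pull_images()))

# minimal_harvest.to_job().execute_in_process()
```

Uncomment the last line to run the graph in-process; each op fires only once its `start` dependency has completed.
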
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisumbcgwids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisumbcgwids0.py
deleted file mode 100644
index 79ad1de7..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisumbcgwids0.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
- # revision of EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
- ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
- # docker-py: for a replicated job, total completions = replicas
- # with replicas=0 you do not get a container
- service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
- mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise f"Container for service {name} not starting"
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
- cid = container.id # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
- ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
- # Future: need to extract files and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def cuahsihisumbcgwids0_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def cuahsihisumbcgwids0_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "cuahsihisumbcgwids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisumbcgwids0_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "cuahsihisumbcgwids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisumbcgwids0_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "cuahsihisumbcgwids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisumbcgwids0_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "cuahsihisumbcgwids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisumbcgwids0_naburelease(context):
- returned_value = gleanerio(context,("release"), "cuahsihisumbcgwids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisumbcgwids0_uploadrelease(context):
- returned_value = postRelease("cuahsihisumbcgwids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisumbcgwids0_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisumbcgwids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisumbcgwids0"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisumbcgwids0_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisumbcgwids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisumbcgwids0"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisumbcgwids0_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisumbcgwids0")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisumbcgwids0"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value is already JSON-encoded
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisumbcgwids0_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisumbcgwids0")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisumbcgwids0"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisumbcgwids0_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisumbcgwids0"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
-#Can we simplify and use just a method. Then import these methods?
-# def missingreport_s3(context, msg: str, source="cuahsihisumbcgwids0"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="cuahsihisumbcgwids0"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_cuahsihisumbcgwids0():
- containers = cuahsihisumbcgwids0_getImage()
- harvest = cuahsihisumbcgwids0_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = cuahsihisumbcgwids0_missingreport_s3(start=harvest)
- report_idstat = cuahsihisumbcgwids0_identifier_stats(start=report_ms3)
- # for some reason, this causes a "missing msg parameter" error
- report_bucketurl = cuahsihisumbcgwids0_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="cuahsihisumbcgwids0")
- load_release = cuahsihisumbcgwids0_naburelease(start=harvest)
- load_uploadrelease = cuahsihisumbcgwids0_uploadrelease(start=load_release)
-
- load_prune = cuahsihisumbcgwids0_nabu_prune(start=load_uploadrelease)
- load_prov = cuahsihisumbcgwids0_nabuprov(start=load_prune)
- load_org = cuahsihisumbcgwids0_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=cuahsihisumbcgwids0_missingreport_graph(start=load_org)
- report_graph=cuahsihisumbcgwids0_graph_reports(start=report_msgraph)
-
-
-
-
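
`_create_service` in these generated modules runs each Gleaner/Nabu invocation as a one-shot Docker Swarm service: `replicated-job` mode with a single replica, a `none` restart policy, and a polling loop that waits for the job's container so logs and the exit code can be collected. A stripped-down sketch of that pattern, assuming the Docker host is a swarm manager; the image, service name, and timeout here are illustrative only:

```python
import time

import docker
from docker.types import RestartPolicy, ServiceMode

client = docker.from_env()  # assumes the host is a swarm manager
service = client.services.create(
    "alpine:3",
    args=["echo", "done"],
    name="sch_demo_job",
    restart_policy=RestartPolicy(condition="none"),  # run once, no restarts
    mode=ServiceMode("replicated-job", replicas=1, concurrency=1),
)
try:
    # poll for the job's task container, as _create_service does
    for _ in range(12):
        containers = client.containers.list(
            all=True,
            filters={"label": "com.docker.swarm.service.name=sch_demo_job"},
        )
        if containers:
            print(containers[0].wait()["StatusCode"])  # 0 on success
            break
        time.sleep(1)
    else:
        raise Exception("container for service sch_demo_job not starting")
finally:
    service.remove()
```

As in the generated code, it is the service (not the container) that gets removed afterwards; removing a replicated-job service also cleans up its task containers.
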
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisumbcwqids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisumbcwqids0.py
deleted file mode 100644
index 7f338f30..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisumbcwqids0.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
- # revision of EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
- ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
- # docker-py: for a replicated job, total completions = replicas
- # with replicas=0 you do not get a container
- service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
- mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise f"Container for service {name} not starting"
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
- cid = container.id # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
-        ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
-        # Future: need to extract the files and upload them
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def cuahsihisumbcwqids0_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def cuahsihisumbcwqids0_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "cuahsihisumbcwqids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisumbcwqids0_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "cuahsihisumbcwqids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisumbcwqids0_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "cuahsihisumbcwqids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisumbcwqids0_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "cuahsihisumbcwqids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisumbcwqids0_naburelease(context):
- returned_value = gleanerio(context,("release"), "cuahsihisumbcwqids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisumbcwqids0_uploadrelease(context):
- returned_value = postRelease("cuahsihisumbcwqids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisumbcwqids0_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisumbcwqids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisumbcwqids0"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-    r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisumbcwqids0_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisumbcwqids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisumbcwqids0"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisumbcwqids0_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisumbcwqids0")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisumbcwqids0"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisumbcwqids0_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisumbcwqids0")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisumbcwqids0"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisumbcwqids0_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisumbcwqids0"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
-#Can we simplify and use just a method. Then import these methods?
-# def missingreport_s3(context, msg: str, source="cuahsihisumbcwqids0"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="cuahsihisumbcwqids0"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_cuahsihisumbcwqids0():
- containers = cuahsihisumbcwqids0_getImage()
- harvest = cuahsihisumbcwqids0_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = cuahsihisumbcwqids0_missingreport_s3(start=harvest)
- report_idstat = cuahsihisumbcwqids0_identifier_stats(start=report_ms3)
-    # for some reason, this caused a "missing msg parameter" error
- report_bucketurl = cuahsihisumbcwqids0_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="cuahsihisumbcwqids0")
- load_release = cuahsihisumbcwqids0_naburelease(start=harvest)
- load_uploadrelease = cuahsihisumbcwqids0_uploadrelease(start=load_release)
-
- load_prune = cuahsihisumbcwqids0_nabu_prune(start=load_uploadrelease)
- load_prov = cuahsihisumbcwqids0_nabuprov(start=load_prune)
- load_org = cuahsihisumbcwqids0_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=cuahsihisumbcwqids0_missingreport_graph(start=load_org)
- report_graph=cuahsihisumbcwqids0_graph_reports(start=report_msgraph)
-
-
-
-
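The generated `@graph` above sequences everything through Dagster's `Nothing` dependencies: each op declares `ins={"start": In(Nothing)}` and is handed an upstream op's output purely to fix execution order, never to pass data. A minimal sketch of the pattern (op and graph names here are illustrative, not from the generated code):

```python
from dagster import In, Nothing, get_dagster_logger, graph, op

@op
def pull_images():
    get_dagster_logger().info("images pulled")

@op(ins={"start": In(Nothing)})
def harvest():
    # runs strictly after pull_images, but receives no value from it
    get_dagster_logger().info("harvesting")

@graph
def ordered_harvest():
    # wiring the call result into the Nothing input creates only an ordering edge
    harvest(start=pull_images())
```

This is the same wiring the generator emits for every source; only the op names change.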
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisunhsnowids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisunhsnowids0.py
deleted file mode 100644
index d5885d6b..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisunhsnowids0.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils.util
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, and not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
-    # revision of EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
-        # Blazegraph reports 'data modified="0"' in the response body when nothing was inserted
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
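`_get_client` above works around docker-py's lack of a per-request header option: Portainer authenticates with an `X-API-Key` header, and since `DockerClient.api` is an `APIClient` (a `requests.Session` subclass), a default header set there rides along on every proxied Engine-API call. A condensed sketch of the same trick (`portainer_docker_client` is a hypothetical helper name):

```python
import docker

def portainer_docker_client(base_url: str, api_key: str) -> docker.DockerClient:
    # DockerClient.api subclasses requests.Session, so a default header here
    # is sent with every request the Portainer proxy sees.
    client = docker.DockerClient(base_url=base_url, version="1.43")
    client.api.headers["X-API-Key"] = api_key
    return client
```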
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
-    ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
-    # docker.py: for a replicated-job, total completions = replicas
-    # with replicas=0 you do not get a container
-    service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
-        mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise f"Container for service {name} not starting"
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
-
-
-
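The comments in `_create_service` above capture the Swarm details this relies on: a `replicated-job` service with `replicas=1` runs its task exactly once, `RestartPolicy(condition='none')` stops Swarm from rescheduling it, and because `services.create` returns before the task's container exists, the code polls for the container by its `com.docker.swarm.service.name` label. A stripped-down sketch of that launch-and-find pattern (function and variable names are illustrative):

```python
import time

import docker
from docker.types import RestartPolicy, ServiceMode

def run_one_shot(client: docker.DockerClient, image: str, args, name: str):
    # replicated-job with replicas=1 => exactly one completion; no restarts
    service = client.services.create(
        image,
        args=args,
        name=name,
        restart_policy=RestartPolicy(condition="none"),
        mode=ServiceMode("replicated-job", concurrency=1, replicas=1),
    )
    # Swarm creates the task's container asynchronously, so poll by label
    for _ in range(12):
        time.sleep(1)
        found = client.containers.list(
            all=True, filters={"label": f"com.docker.swarm.service.name={name}"}
        )
        if found:
            return service, found[0]
    raise RuntimeError(f"container for service {name} never appeared")
```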
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
-        cid = container.id  # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
-        ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
-        # Future: need to extract the files and upload them
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
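The `/containers/{id}/archive` endpoint used above returns a tar stream, which is why the raw `r.read()` bytes land in S3 unexpanded; the `# Future` note is about unpacking them first. A sketch of that step, assuming `archive_bytes` holds the response body (`iter_log_members` is a hypothetical helper):

```python
import io
import tarfile

def iter_log_members(archive_bytes: bytes):
    # the archive endpoint returns an uncompressed tar of the requested path
    with tarfile.open(fileobj=io.BytesIO(archive_bytes)) as tar:
        for member in tar.getmembers():
            if member.isfile():
                yield member.name, tar.extractfile(member).read()

# e.g. for fname, body in iter_log_members(r.read()):
#          s3loader(body, f"{source}_{fname}_runlogs")
```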
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def cuahsihisunhsnowids0_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def cuahsihisunhsnowids0_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "cuahsihisunhsnowids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisunhsnowids0_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "cuahsihisunhsnowids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisunhsnowids0_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "cuahsihisunhsnowids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisunhsnowids0_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "cuahsihisunhsnowids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisunhsnowids0_naburelease(context):
- returned_value = gleanerio(context,("release"), "cuahsihisunhsnowids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisunhsnowids0_uploadrelease(context):
- returned_value = postRelease("cuahsihisunhsnowids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisunhsnowids0_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisunhsnowids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisunhsnowids0"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-    r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisunhsnowids0_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisunhsnowids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisunhsnowids0"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisunhsnowids0_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisunhsnowids0")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisunhsnowids0"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisunhsnowids0_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisunhsnowids0")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisunhsnowids0"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisunhsnowids0_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisunhsnowids0"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
-#Can we simplify and use just a method. Then import these methods?
-# def missingreport_s3(context, msg: str, source="cuahsihisunhsnowids0"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="cuahsihisunhsnowids0"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_cuahsihisunhsnowids0():
- containers = cuahsihisunhsnowids0_getImage()
- harvest = cuahsihisunhsnowids0_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = cuahsihisunhsnowids0_missingreport_s3(start=harvest)
- report_idstat = cuahsihisunhsnowids0_identifier_stats(start=report_ms3)
-    # for some reason, this caused a "missing msg parameter" error
- report_bucketurl = cuahsihisunhsnowids0_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="cuahsihisunhsnowids0")
- load_release = cuahsihisunhsnowids0_naburelease(start=harvest)
- load_uploadrelease = cuahsihisunhsnowids0_uploadrelease(start=load_release)
-
- load_prune = cuahsihisunhsnowids0_nabu_prune(start=load_uploadrelease)
- load_prov = cuahsihisunhsnowids0_nabuprov(start=load_prune)
- load_org = cuahsihisunhsnowids0_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=cuahsihisunhsnowids0_missingreport_graph(start=load_org)
- report_graph=cuahsihisunhsnowids0_graph_reports(start=report_msgraph)
-
-
-
-
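Each of these deleted modules is identical except for the embedded source name, which is what the in-file question (`Can we simplify and use just a method?`) is driving at. A hedged sketch of that factory alternative, building the ops per source at import time instead of generating one file per source (all names here are illustrative, and the real op bodies would call `gleanerio` as above):

```python
from dagster import In, Nothing, get_dagster_logger, graph, op

def make_harvest_graph(source: str):
    # one parameterized builder instead of one generated module per source
    @op(name=f"{source}_getImage")
    def get_image(context):
        get_dagster_logger().info(f"pull gleaner/nabu images for {source}")

    @op(name=f"{source}_gleaner", ins={"start": In(Nothing)})
    def run_gleaner(context):
        get_dagster_logger().info(f"harvest {source}")

    @graph(name=f"harvest_{source}")
    def harvest():
        run_gleaner(start=get_image())

    return harvest

# e.g. jobs = [make_harvest_graph(s).to_job()
#              for s in ("cuahsihisunhsnowids0", "cuahsihisweiherbachids0")]
```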
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisweiherbachids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisweiherbachids0.py
deleted file mode 100644
index e9033092..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisweiherbachids0.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils.util
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, and not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
-    # revision of EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
-        # Blazegraph reports 'data modified="0"' in the response body when nothing was inserted
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
-    ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
-    # docker.py: for a replicated-job, total completions = replicas
-    # with replicas=0 you do not get a container
-    service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
-        mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise f"Container for service {name} not starting"
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
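- # note: only Cmd and WorkingDir from this legacy dict reach _create_service;
- # the headless network is actually attached via the DockerContainerContext below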
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
- cid = container.id # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
- ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
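- # GET /containers/{id}/archive is the Docker Engine API endpoint returning a tar
- # stream of the given path; here it fetches the log directory through the
- # Portainer proxy, hence the X-API-Key header below.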
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
- # Future: need to extract files and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues; it returns nothing.
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def cuahsihisweiherbachids0_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
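- # pre-pull both images once per run so the per-mode services launched later
- # should not need to pull them at start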
-@op(ins={"start": In(Nothing)})
-def cuahsihisweiherbachids0_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "cuahsihisweiherbachids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisweiherbachids0_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "cuahsihisweiherbachids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisweiherbachids0_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "cuahsihisweiherbachids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisweiherbachids0_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "cuahsihisweiherbachids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisweiherbachids0_naburelease(context):
- returned_value = gleanerio(context,("release"), "cuahsihisweiherbachids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisweiherbachids0_uploadrelease(context):
- returned_value = postRelease("cuahsihisweiherbachids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisweiherbachids0_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisweiherbachids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisweiherbachids0"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisweiherbachids0_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisweiherbachids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisweiherbachids0"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisweiherbachids0_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisweiherbachids0")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisweiherbachids0"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisweiherbachids0_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisweiherbachids0")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisweiherbachids0"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisweiherbachids0_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisweiherbachids0"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
-# Can we simplify and use just a method, then import these methods?
-# def missingreport_s3(context, msg: str, source="cuahsihisweiherbachids0"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="cuahsihisweiherbachids0"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_cuahsihisweiherbachids0():
- containers = cuahsihisweiherbachids0_getImage()
- harvest = cuahsihisweiherbachids0_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = cuahsihisweiherbachids0_missingreport_s3(start=harvest)
- report_idstat = cuahsihisweiherbachids0_identifier_stats(start=report_ms3)
- # for some reason, this causes a msg parameter missing
- report_bucketurl = cuahsihisweiherbachids0_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="cuahsihisweiherbachids0")
- load_release = cuahsihisweiherbachids0_naburelease(start=harvest)
- load_uploadrelease = cuahsihisweiherbachids0_uploadrelease(start=load_release)
-
- load_prune = cuahsihisweiherbachids0_nabu_prune(start=load_uploadrelease)
- load_prov = cuahsihisweiherbachids0_nabuprov(start=load_prune)
- load_org = cuahsihisweiherbachids0_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=cuahsihisweiherbachids0_missingreport_graph(start=load_org)
- report_graph=cuahsihisweiherbachids0_graph_reports(start=report_msgraph)
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisyosemitehydroclimatenetworkids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisyosemitehydroclimatenetworkids0.py
deleted file mode 100644
index b675d549..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisyosemitehydroclimatenetworkids0.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- with open(image_path, 'rb') as f:
- return f.read()
-
-
-def load_data(file_or_url):
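- # try the argument as a URL first; urlopen raises ValueError ("unknown url type")
- # for plain paths, and the except clause falls back to a local file read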
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
- # revision of EC utilities, will have a insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # a response body containing <data modified="0" .../> means nothing was inserted
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
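- # docker-py exposes no public hook for extra HTTP headers, so the Portainer API
- # key is pushed into the private _general_configs/headers attributes below;
- # this relies on docker-py internals and may break across versions.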
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
- ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
- # docker.py if replicated job, total completions = replicas
- # replicas =0 you do not get a container
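- # "replicated-job" runs the task to completion exactly once; with restart
- # condition 'none' the finished container is not restarted, so wait() later
- # observes the real exit code.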
- service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
- mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
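- # services.create returns before swarm schedules the task, so poll (up to ~12s)
- # until a container labeled with this service name appears.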
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise f"Container for service {name} not starting"
-
- get_dagster_logger().info(f"containers found for service {name}: {len(containers)}")
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add env variables here, e.g. "Env": ["FOO=bar", "BAZ=quux"]
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
- cid = container.id # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
- ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
- # Future: need to extract files and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues; it returns nothing.
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def cuahsihisyosemitehydroclimatenetworkids0_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def cuahsihisyosemitehydroclimatenetworkids0_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "cuahsihisyosemitehydroclimatenetworkids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisyosemitehydroclimatenetworkids0_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "cuahsihisyosemitehydroclimatenetworkids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisyosemitehydroclimatenetworkids0_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "cuahsihisyosemitehydroclimatenetworkids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisyosemitehydroclimatenetworkids0_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "cuahsihisyosemitehydroclimatenetworkids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisyosemitehydroclimatenetworkids0_naburelease(context):
- returned_value = gleanerio(context,("release"), "cuahsihisyosemitehydroclimatenetworkids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisyosemitehydroclimatenetworkids0_uploadrelease(context):
- returned_value = postRelease("cuahsihisyosemitehydroclimatenetworkids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisyosemitehydroclimatenetworkids0_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisyosemitehydroclimatenetworkids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisyosemitehydroclimatenetworkids0"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisyosemitehydroclimatenetworkids0_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisyosemitehydroclimatenetworkids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisyosemitehydroclimatenetworkids0"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def cuahsihisyosemitehydroclimatenetworkids0_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisyosemitehydroclimatenetworkids0")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisyosemitehydroclimatenetworkids0"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisyosemitehydroclimatenetworkids0_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisyosemitehydroclimatenetworkids0")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisyosemitehydroclimatenetworkids0"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def cuahsihisyosemitehydroclimatenetworkids0_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "cuahsihisyosemitehydroclimatenetworkids0"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
-# Can we simplify and use just a method, then import these methods?
-# def missingreport_s3(context, msg: str, source="cuahsihisyosemitehydroclimatenetworkids0"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="cuahsihisyosemitehydroclimatenetworkids0"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_cuahsihisyosemitehydroclimatenetworkids0():
- containers = cuahsihisyosemitehydroclimatenetworkids0_getImage()
- harvest = cuahsihisyosemitehydroclimatenetworkids0_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = cuahsihisyosemitehydroclimatenetworkids0_missingreport_s3(start=harvest)
- report_idstat = cuahsihisyosemitehydroclimatenetworkids0_identifier_stats(start=report_ms3)
- # for some reason, this causes a msg parameter missing
- report_bucketurl = cuahsihisyosemitehydroclimatenetworkids0_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="cuahsihisyosemitehydroclimatenetworkids0")
- load_release = cuahsihisyosemitehydroclimatenetworkids0_naburelease(start=harvest)
- load_uploadrelease = cuahsihisyosemitehydroclimatenetworkids0_uploadrelease(start=load_release)
-
- load_prune = cuahsihisyosemitehydroclimatenetworkids0_nabu_prune(start=load_uploadrelease)
- load_prov = cuahsihisyosemitehydroclimatenetworkids0_nabuprov(start=load_prune)
- load_org = cuahsihisyosemitehydroclimatenetworkids0_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=cuahsihisyosemitehydroclimatenetworkids0_missingreport_graph(start=load_org)
- report_graph=cuahsihisyosemitehydroclimatenetworkids0_graph_reports(start=report_msgraph)
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_dams0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_dams0.py
deleted file mode 100644
index 8ce6f1c8..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_dams0.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
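-# the MinIO client expects a bare endpoint, so any *.amazonaws.com address is
-# collapsed to the generic s3.amazonaws.com endpoint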
-def read_file_bytestream(image_path):
- with open(image_path, 'rb') as f:
- return f.read()
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
- # revision of EC utilities, will have a insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # a response body containing <data modified="0" .../> means nothing was inserted
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
- ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
- # docker.py if replicated job, total completions = replicas
- # replicas =0 you do not get a container
- service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
- mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise f"Container for service {name} not starting"
-
- get_dagster_logger().info(f"containers found for service {name}: {len(containers)}")
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
- cid = container.id # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
- ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
- # Future: need to extract files from the archive tar and upload them, e.g.
- # (sketch; archive_bytes is the archive response body, tarfile needs importing):
- # pw_tar = tarfile.open(fileobj=io.BytesIO(archive_bytes), mode="r:*")
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def dams0_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def dams0_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "dams0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def dams0_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "dams0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def dams0_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "dams0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def dams0_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "dams0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def dams0_naburelease(context):
- returned_value = gleanerio(context,("release"), "dams0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def dams0_uploadrelease(context):
- returned_value = postRelease("dams0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def dams0_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="dams0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "dams0"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def dams0_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="dams0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "dams0"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def dams0_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="dams0")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "dams0"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def dams0_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="dams0")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "dams0"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def dams0_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "dams0"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
-#Can we simplify and use just a method. Then import these methods?
-# def missingreport_s3(context, msg: str, source="dams0"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="dams0"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_dams0():
- containers = dams0_getImage()
- harvest = dams0_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = dams0_missingreport_s3(start=harvest)
- report_idstat = dams0_identifier_stats(start=report_ms3)
- # for some reason, this causes a "msg parameter missing" error
- report_bucketurl = dams0_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="dams0")
- load_release = dams0_naburelease(start=harvest)
- load_uploadrelease = dams0_uploadrelease(start=load_release)
-
- load_prune = dams0_nabu_prune(start=load_uploadrelease)
- load_prov = dams0_nabuprov(start=load_prune)
- load_org = dams0_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=dams0_missingreport_graph(start=load_org)
- report_graph=dams0_graph_reports(start=report_msgraph)
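-
-# sketch (hypothetical names; the generated project wires up its own jobs and
-# schedules elsewhere): this graph can be turned into a schedulable job with
-#   from dagster import ScheduleDefinition
-#   harvest_dams0_job = harvest_dams0.to_job(name="harvest_dams0_job")
-#   schedule = ScheduleDefinition(job=harvest_dams0_job, cron_schedule="0 6 * * *")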
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_dams1.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_dams1.py
deleted file mode 100644
index be7d46d1..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_dams1.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils.util
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, and not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
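-# e.g. _pythonMinioUrl("s3.us-west-2.amazonaws.com") returns "s3.amazonaws.com",
-# while a self-hosted address such as "minio.example.org" (hypothetical) passes
-# through unchanged.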
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
- # a future revision of the EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
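-# worked example with hypothetical values (minio:9000, bucket "gleaner",
-# graph namespace "earthcube"): postRelease("dams1") POSTs to
-# http://graph:9999/blazegraph/namespace/earthcube/sparql?uri=http://minio:9000/gleaner/graphs/latest/dams1_release.nq
-# so the graph store (Blazegraph here) fetches and loads the N-Quads release file itself.
-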
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
- ## thoughts
- # return both service and container, since there is only one container
- restart_policy = RestartPolicy(condition='none')
- # docker-py: for a replicated-job service, total completions = replicas
- # with replicas=0 you do not get a container
- service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
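- # CLI analogue (for illustration): docker service create --mode replicated-job ...
- # a replicated-job task runs once to completion instead of being restarted,
- # which pairs with the condition="none" restart policy above.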
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
- mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise f"Container for service {name} not starting"
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
- cid = container.id # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
- ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
- # Future: need to extract files from the archive tar and upload them, e.g.
- # (sketch; archive_bytes is the archive response body, tarfile needs importing):
- # pw_tar = tarfile.open(fileobj=io.BytesIO(archive_bytes), mode="r:*")
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def dams1_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def dams1_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "dams1")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def dams1_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "dams1")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def dams1_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "dams1")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def dams1_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "dams1")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def dams1_naburelease(context):
- returned_value = gleanerio(context,("release"), "dams1")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def dams1_uploadrelease(context):
- returned_value = postRelease("dams1")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def dams1_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="dams1")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "dams1"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def dams1_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="dams1")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "dams1"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def dams1_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="dams1")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "dams1"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def dams1_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="dams1")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "dams1"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def dams1_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "dams1"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
-#Can we simplify and use just a method. Then import these methods?
-# def missingreport_s3(context, msg: str, source="dams1"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="dams1"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_dams1():
- containers = dams1_getImage()
- harvest = dams1_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = dams1_missingreport_s3(start=harvest)
- report_idstat = dams1_identifier_stats(start=report_ms3)
- # for some reason, this causes a "msg parameter missing" error
- report_bucketurl = dams1_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="dams1")
- load_release = dams1_naburelease(start=harvest)
- load_uploadrelease = dams1_uploadrelease(start=load_release)
-
- load_prune = dams1_nabu_prune(start=load_uploadrelease)
- load_prov = dams1_nabuprov(start=load_prune)
- load_org = dams1_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=dams1_missingreport_graph(start=load_org)
- report_graph=dams1_graph_reports(start=report_msgraph)
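-
-# a runnable job can be derived from this graph (sketch; the job name is
-# assumed, and the generated project defines its own jobs elsewhere):
-#   harvest_dams1_job = harvest_dams1.to_job(name="harvest_dams1_job")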
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_damspids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_damspids0.py
deleted file mode 100644
index cab9e83c..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_damspids0.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils.util
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, and not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
- # a future revision of the EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
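-
-# note: docker-py offers no first-class option for custom auth headers, so the
-# Portainer X-API-Key is pushed into both the client's general HTTP config and
-# the per-request headers; base_url here is the Portainer-proxied Docker API.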
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
- ## thoughts
- # return both service and container, since there is only one container
- restart_policy = RestartPolicy(condition='none')
- # docker-py: for a replicated-job service, total completions = replicas
- # with replicas=0 you do not get a container
- service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
- mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise f"Container for service {name} not starting"
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
- cid = container.id # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
- ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes-like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
- # Future: need to extract files and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def damspids0_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def damspids0_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "damspids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def damspids0_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "damspids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def damspids0_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "damspids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def damspids0_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "damspids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def damspids0_naburelease(context):
- returned_value = gleanerio(context,("release"), "damspids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def damspids0_uploadrelease(context):
- returned_value = postRelease("damspids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def damspids0_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="damspids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "damspids0"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def damspids0_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="damspids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "damspids0"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def damspids0_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="damspids0")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "damspids0"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def damspids0_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="damspids0")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "damspids0"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def damspids0_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "damspids0"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
-# Can we simplify and just use a method, then import these methods?
-# def missingreport_s3(context, msg: str, source="damspids0"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="damspids0"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_damspids0():
- containers = damspids0_getImage()
- harvest = damspids0_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = damspids0_missingreport_s3(start=harvest)
- report_idstat = damspids0_identifier_stats(start=report_ms3)
- # for some reason, this causes a "msg parameter missing" error
- report_bucketurl = damspids0_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="damspids0")
- load_release = damspids0_naburelease(start=harvest)
- load_uploadrelease = damspids0_uploadrelease(start=load_release)
-
- load_prune = damspids0_nabu_prune(start=load_uploadrelease)
- load_prov = damspids0_nabuprov(start=load_prune)
- load_org = damspids0_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=damspids0_missingreport_graph(start=load_org)
- report_graph=damspids0_graph_reports(start=report_msgraph)
-
-
-
-
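Note: the generated `_create_service` above leans on Docker Swarm's one-shot job support: a `replicated-job` service with `RestartPolicy(condition="none")`, plus a polling loop, because Swarm schedules the job's container asynchronously. A minimal standalone sketch of that pattern follows; the helper name, image, and timeout are illustrative (not part of the scheduler code), and it assumes docker-py 5.0+, where `replicated-job` mode exists.

```python
import time

import docker
from docker.types import RestartPolicy, ServiceMode


def run_one_shot_service(client, image, args, name, timeout_s=12):
    """Start a replicated-job service and wait for its single container."""
    service = client.services.create(
        image,
        args=args,
        name=name,
        restart_policy=RestartPolicy(condition="none"),  # never restart a finished job
        mode=ServiceMode("replicated-job", concurrency=1, replicas=1),
    )
    # Swarm creates the task (and its container) asynchronously, so poll for it.
    for _ in range(timeout_s):
        time.sleep(1)
        containers = client.containers.list(
            all=True, filters={"label": f"com.docker.swarm.service.name={name}"}
        )
        if containers:
            return service, containers[0]
    service.remove()
    raise RuntimeError(f"Container for service {name} not starting")


# usage (hypothetical image/args):
# client = docker.from_env()
# service, container = run_one_shot_service(client, "alpine:3", ["echo", "hi"], "sch_demo")
```

Polling on the `com.docker.swarm.service.name` label is what lets the caller stream logs from the single job container afterwards, as the generated `gleanerio` function does.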
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_demo0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_demo0.py
deleted file mode 100644
index f2e1c016..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_demo0.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils.util
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, and not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
- # revision of EC utilities, will have a insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
- ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
- # docker.py if replicated job, total completions = replicas
- # replicas =0 you do not get a container
- service_mode = ServiceMode("replicated-job",concurrency=1,replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
- mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise f"Container for service {name} not starting"
-
- get_dagster_logger().info(str(len(containers)))
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
- cid = container.id # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
- ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes-like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
- # Future: need to extract files and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def demo0_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def demo0_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "demo0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def demo0_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "demo0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def demo0_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "demo0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def demo0_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "demo0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def demo0_naburelease(context):
- returned_value = gleanerio(context,("release"), "demo0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def demo0_uploadrelease(context):
- returned_value = postRelease("demo0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def demo0_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="demo0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "demo0"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def demo0_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="demo0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "demo0"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def demo0_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="demo0")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "demo0"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def demo0_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="demo0")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "demo0"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def demo0_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "demo0"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
-# Can we simplify and just use a method, then import these methods?
-# def missingreport_s3(context, msg: str, source="demo0"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="demo0"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_demo0():
- containers = demo0_getImage()
- harvest = demo0_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = demo0_missingreport_s3(start=harvest)
- report_idstat = demo0_identifier_stats(start=report_ms3)
- # for some reason, this causes a "msg parameter missing" error
- report_bucketurl = demo0_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="demo0")
- load_release = demo0_naburelease(start=harvest)
- load_uploadrelease = demo0_uploadrelease(start=load_release)
-
- load_prune = demo0_nabu_prune(start=load_uploadrelease)
- load_prov = demo0_nabuprov(start=load_prune)
- load_org = demo0_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=demo0_missingreport_graph(start=load_org)
- report_graph=demo0_graph_reports(start=report_msgraph)
-
-
-
-
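Note: the generated `harvest_demo0` graph above sequences its ops purely for ordering: each downstream op declares `ins={"start": In(Nothing)}` and is invoked with `start=<upstream result>`, so Dagster enforces execution order without passing any data (see the docs link in the generated code). A minimal standalone sketch of that pattern, with illustrative op and graph names:

```python
from dagster import In, Nothing, get_dagster_logger, graph, op


@op
def first():
    get_dagster_logger().info("first")


@op(ins={"start": In(Nothing)})
def second():
    # a Nothing input never reaches the function body; it only orders execution
    get_dagster_logger().info("second runs only after first")


@graph
def ordered():
    second(start=first())


# ordered.to_job().execute_in_process()  # runs first, then second
```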
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_gfv11pois0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_gfv11pois0.py
deleted file mode 100644
index 8e61d057..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_gfv11pois0.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils.util
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, and not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
- # revision of EC utilities, will have a insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
- ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
- # docker.py if replicated job, total completions = replicas
- # replicas =0 you do not get a container
- service_mode = ServiceMode("replicated-job",concurrency=1,replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
- mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise f"Container for service {name} not starting"
-
- get_dagster_logger().info(str(len(containers)))
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
-        enva = []
-        enva.append(f"MINIO_ADDRESS={GLEANER_MINIO_ADDRESS}")
-        enva.append(f"MINIO_PORT={GLEANER_MINIO_PORT}")
-        enva.append(f"MINIO_USE_SSL={GLEANER_MINIO_USE_SSL}")
-        enva.append(f"MINIO_SECRET_KEY={GLEANER_MINIO_SECRET_KEY}")
-        enva.append(f"MINIO_ACCESS_KEY={GLEANER_MINIO_ACCESS_KEY}")
-        enva.append(f"MINIO_BUCKET={GLEANER_MINIO_BUCKET}")
-        enva.append(f"SPARQL_ENDPOINT={_graphEndpoint()}")
-        enva.append(f"GLEANER_HEADLESS_ENDPOINT={GLEANER_HEADLESS_ENDPOINT}")
-        enva.append(f"GLEANER_HEADLESS_NETWORK={GLEANER_HEADLESS_NETWORK}")
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
-    cid = container.id  # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
-    # do not let a possible issue with container logs stop the log upload.
-    ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
-    # we pull the logs first, then throw an error afterwards if needed
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
-    # Future: need to extract the files and upload them
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def gfv11pois0_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def gfv11pois0_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "gfv11pois0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def gfv11pois0_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "gfv11pois0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def gfv11pois0_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "gfv11pois0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def gfv11pois0_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "gfv11pois0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def gfv11pois0_naburelease(context):
- returned_value = gleanerio(context,("release"), "gfv11pois0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def gfv11pois0_uploadrelease(context):
- returned_value = postRelease("gfv11pois0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def gfv11pois0_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="gfv11pois0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "gfv11pois0"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-    r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def gfv11pois0_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="gfv11pois0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "gfv11pois0"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def gfv11pois0_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="gfv11pois0")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "gfv11pois0"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def gfv11pois0_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="gfv11pois0")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "gfv11pois0"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
-    get_dagster_logger().info(f"identifier stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def gfv11pois0_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "gfv11pois0"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
-    get_dagster_logger().info(f"bucket urls report returned {r} ")
- return
-
-
-# Can we simplify and just use a method, then import these methods?
-# def missingreport_s3(context, msg: str, source="gfv11pois0"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="gfv11pois0"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_gfv11pois0():
- containers = gfv11pois0_getImage()
- harvest = gfv11pois0_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
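-    # the start=... keywords below only fix execution order; with In(Nothing)
-    # inputs, no data is passed between the ops.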
-
- report_ms3 = gfv11pois0_missingreport_s3(start=harvest)
- report_idstat = gfv11pois0_identifier_stats(start=report_ms3)
-    # for some reason, this causes a missing msg parameter error
- report_bucketurl = gfv11pois0_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="gfv11pois0")
- load_release = gfv11pois0_naburelease(start=harvest)
- load_uploadrelease = gfv11pois0_uploadrelease(start=load_release)
-
- load_prune = gfv11pois0_nabu_prune(start=load_uploadrelease)
- load_prov = gfv11pois0_nabuprov(start=load_prune)
- load_org = gfv11pois0_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=gfv11pois0_missingreport_graph(start=load_org)
- report_graph=gfv11pois0_graph_reports(start=report_msgraph)
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_gfv11pois1.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_gfv11pois1.py
deleted file mode 100644
index 74d86fb5..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_gfv11pois1.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils.util
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, and not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
-    with open(image_path, 'rb') as f:
-        data = f.read()
-    return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
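-    # the Minio client expects a bare host[:port] endpoint, so the port is
-    # dropped when it is the scheme default (80 for http, 443 for https).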
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
- # revision of EC utilities, will have a insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
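-    # POSTing with a uri= query parameter asks the graph store (blazegraph
-    # here) to fetch and load the release nquads from that URL itself.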
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
-        # a 200 with data modified="0" in the body means nothing was loaded
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
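-    # URL is the Portainer-proxied docker engine API, which authenticates via
-    # the X-API-Key header, so the key is patched into both the client's
-    # general config and its live session headers.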
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
-    ## thoughts
- # return service, container, since there is one
-    restart_policy = RestartPolicy(condition='none')
-    # docker-py: for a replicated job, total completions = replicas
-    # with replicas=0 you do not get a container
-    service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
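-    # a swarm replicated-job with replicas=1 and concurrency=1 runs the task
-    # to completion exactly once (with the 'none' restart policy above), which
-    # fits a one-shot harvest run.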
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
-    get_dagster_logger().info(f"attaching docker configs to service {name}")
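-    # docker-py's ConfigReference(config_id, config_name, filename) attaches an
-    # existing swarm config to the service and mounts it in the container at
-    # filename, so the gleaner/nabu YAML configs land at their expected paths.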
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
-    service = client.services.create(
-        image,
-        args=command,
-        env=env_vars,
-        name=name,
-        networks=container_context.networks if len(container_context.networks) else None,
-        restart_policy=restart_policy,
-        mode=service_mode,
-        workdir=workingdir,
-        configs=configs
-    )
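-    # the replicated-job service starts asynchronously, so poll (about once a
-    # second, ~12 tries) until swarm lists a container labeled with this
-    # service name.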
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
-        if len(containers) > 0:
-            break
-        if wait_count > 12:
-            raise Exception(f"Container for service {name} not starting")
-
-    get_dagster_logger().info(f"containers found: {len(containers)}")
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
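-    # mode selects what to run: "gleaner" harvests with the gleaner image,
-    # while "prune", "prov", "orgs" and "release" run nabu subcommands
-    # against the nabu image and its config.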
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
-        enva = []
-        enva.append(f"MINIO_ADDRESS={GLEANER_MINIO_ADDRESS}")
-        enva.append(f"MINIO_PORT={GLEANER_MINIO_PORT}")
-        enva.append(f"MINIO_USE_SSL={GLEANER_MINIO_USE_SSL}")
-        enva.append(f"MINIO_SECRET_KEY={GLEANER_MINIO_SECRET_KEY}")
-        enva.append(f"MINIO_ACCESS_KEY={GLEANER_MINIO_ACCESS_KEY}")
-        enva.append(f"MINIO_BUCKET={GLEANER_MINIO_BUCKET}")
-        enva.append(f"SPARQL_ENDPOINT={_graphEndpoint()}")
-        enva.append(f"GLEANER_HEADLESS_ENDPOINT={GLEANER_HEADLESS_ENDPOINT}")
-        enva.append(f"GLEANER_HEADLESS_NETWORK={GLEANER_HEADLESS_NETWORK}")
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
-    cid = container.id  # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
-    # do not let a possible issue with container logs stop the log upload.
-    ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
-    # we pull the logs first, then throw an error afterwards if needed
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
-    # Future: need to extract the files and upload them
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def gfv11pois1_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def gfv11pois1_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "gfv11pois1")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def gfv11pois1_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "gfv11pois1")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def gfv11pois1_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "gfv11pois1")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def gfv11pois1_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "gfv11pois1")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def gfv11pois1_naburelease(context):
- returned_value = gleanerio(context,("release"), "gfv11pois1")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def gfv11pois1_uploadrelease(context):
- returned_value = postRelease("gfv11pois1")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def gfv11pois1_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="gfv11pois1")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "gfv11pois1"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-    r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def gfv11pois1_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="gfv11pois1")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "gfv11pois1"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def gfv11pois1_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="gfv11pois1")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "gfv11pois1"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def gfv11pois1_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="gfv11pois1")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "gfv11pois1"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
-    get_dagster_logger().info(f"identifier stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def gfv11pois1_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "gfv11pois1"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
-    get_dagster_logger().info(f"bucket urls report returned {r} ")
- return
-
-
-# Can we simplify and just use a method, then import these methods?
-# def missingreport_s3(context, msg: str, source="gfv11pois1"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="gfv11pois1"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_gfv11pois1():
- containers = gfv11pois1_getImage()
- harvest = gfv11pois1_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
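-    # the start=... keywords below only fix execution order; with In(Nothing)
-    # inputs, no data is passed between the ops.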
-
- report_ms3 = gfv11pois1_missingreport_s3(start=harvest)
- report_idstat = gfv11pois1_identifier_stats(start=report_ms3)
-    # for some reason, this causes a missing msg parameter error
- report_bucketurl = gfv11pois1_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="gfv11pois1")
- load_release = gfv11pois1_naburelease(start=harvest)
- load_uploadrelease = gfv11pois1_uploadrelease(start=load_release)
-
- load_prune = gfv11pois1_nabu_prune(start=load_uploadrelease)
- load_prov = gfv11pois1_nabuprov(start=load_prune)
- load_org = gfv11pois1_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=gfv11pois1_missingreport_graph(start=load_org)
- report_graph=gfv11pois1_graph_reports(start=report_msgraph)
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_hmw0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_hmw0.py
deleted file mode 100644
index b32e4fa7..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_hmw0.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils.util
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, and not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
-    with open(image_path, 'rb') as f:
-        data = f.read()
-    return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
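-    # the Minio client expects a bare host[:port] endpoint, so the port is
-    # dropped when it is the scheme default (80 for http, 443 for https).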
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
- # revision of EC utilities, will have a insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
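-    # POSTing with a uri= query parameter asks the graph store (blazegraph
-    # here) to fetch and load the release nquads from that URL itself.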
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
-        # a 200 with data modified="0" in the body means nothing was loaded
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
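-    # URL is the Portainer-proxied docker engine API, which authenticates via
-    # the X-API-Key header, so the key is patched into both the client's
-    # general config and its live session headers.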
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
-    ## thoughts
- # return service, container, since there is one
-    restart_policy = RestartPolicy(condition='none')
-    # docker-py: for a replicated job, total completions = replicas
-    # with replicas=0 you do not get a container
-    service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
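-    # a swarm replicated-job with replicas=1 and concurrency=1 runs the task
-    # to completion exactly once (with the 'none' restart policy above), which
-    # fits a one-shot harvest run.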
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
-    get_dagster_logger().info(f"attaching docker configs to service {name}")
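-    # docker-py's ConfigReference(config_id, config_name, filename) attaches an
-    # existing swarm config to the service and mounts it in the container at
-    # filename, so the gleaner/nabu YAML configs land at their expected paths.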
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
-    service = client.services.create(
-        image,
-        args=command,
-        env=env_vars,
-        name=name,
-        networks=container_context.networks if len(container_context.networks) else None,
-        restart_policy=restart_policy,
-        mode=service_mode,
-        workdir=workingdir,
-        configs=configs
-    )
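-    # the replicated-job service starts asynchronously, so poll (about once a
-    # second, ~12 tries) until swarm lists a container labeled with this
-    # service name.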
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
-        if len(containers) > 0:
-            break
-        if wait_count > 12:
-            raise Exception(f"Container for service {name} not starting")
-
-    get_dagster_logger().info(f"containers found: {len(containers)}")
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
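-    # mode selects what to run: "gleaner" harvests with the gleaner image,
-    # while "prune", "prov", "orgs" and "release" run nabu subcommands
-    # against the nabu image and its config.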
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
-        # add in env variables here, e.g. "Env": ["FOO=bar", "BAZ=quux"]
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
-        cid = container.id  # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
-        ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
-        # Future: need to extract files and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
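-        # A sketch of that future step (hypothetical: assumes the raw bytes
-        # returned by r.read() above were kept in archive_bytes rather than
-        # passed straight to s3loader; tarfile would need to be imported):
-        # pw_tar = tarfile.open(fileobj=io.BytesIO(archive_bytes), mode="r")
-        # for member in pw_tar.getmembers():
-        #     if member.isfile():
-        #         s3loader(pw_tar.extractfile(member).read(),
-        #                  f"{source}_{mode}_{member.name.replace('/', '_')}")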
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
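-# The if/elif chain in gleanerio() above could be table-driven; an
-# illustrative sketch only (not wired into the generated code). Each nabu
-# mode maps to a function that builds the argument list for a source:
-_NABU_MODE_ARGS = {
-    "prune":   lambda src: ["prune", "--prefix", "summoned/" + src],
-    "prov":    lambda src: ["prefix", "--prefix", "prov/" + src],
-    "orgs":    lambda src: ["prefix", "--prefix", "orgs"],
-    "release": lambda src: ["release", "--prefix", "summoned/" + src],
-}
-# usage: ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH] + _NABU_MODE_ARGS[mode](source)
-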
-@op
-def hmw0_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def hmw0_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "hmw0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def hmw0_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "hmw0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def hmw0_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "hmw0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def hmw0_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "hmw0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def hmw0_naburelease(context):
- returned_value = gleanerio(context,("release"), "hmw0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def hmw0_uploadrelease(context):
- returned_value = postRelease("hmw0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def hmw0_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="hmw0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "hmw0"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-    r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def hmw0_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="hmw0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "hmw0"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def hmw0_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="hmw0")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "hmw0"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def hmw0_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="hmw0")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "hmw0"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
-    get_dagster_logger().info(f"identifier stats report returned {r}")
- return
-
-@op(ins={"start": In(Nothing)})
-def hmw0_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "hmw0"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
-    get_dagster_logger().info(f"bucket urls report returned {r}")
- return
-
-
-#Can we simplify and use just a method. Then import these methods?
-# def missingreport_s3(context, msg: str, source="hmw0"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="hmw0"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_hmw0():
- containers = hmw0_getImage()
- harvest = hmw0_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = hmw0_missingreport_s3(start=harvest)
- report_idstat = hmw0_identifier_stats(start=report_ms3)
- # for some reason, this causes a msg parameter missing
- report_bucketurl = hmw0_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="hmw0")
- load_release = hmw0_naburelease(start=harvest)
- load_uploadrelease = hmw0_uploadrelease(start=load_release)
-
- load_prune = hmw0_nabu_prune(start=load_uploadrelease)
- load_prov = hmw0_nabuprov(start=load_prune)
- load_org = hmw0_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=hmw0_missingreport_graph(start=load_org)
- report_graph=hmw0_graph_reports(start=report_msgraph)
-
-
-
-
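-# A sketch of wiring the graph into a runnable job plus a schedule, assuming
-# the standard Dagster APIs (the generated repository may already do this
-# elsewhere; the cron string is only an example):
-# from dagster import ScheduleDefinition
-# harvest_hmw0_job = harvest_hmw0.to_job()
-# harvest_hmw0_schedule = ScheduleDefinition(
-#     job=harvest_hmw0_job, cron_schedule="0 6 * * 1"
-# )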
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_hmw1.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_hmw1.py
deleted file mode 100644
index 1511308d..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_hmw1.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils.util
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, and not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
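-# Usage sketch for s3loader: it needs a bytes-like object and a log name;
-# the object lands under GLEANERIO_LOG_PREFIX with a timestamp suffix, e.g.
-#   s3loader("hello".encode(), "sch_example_test")
-# would write scheduler/logs/sch_example_test_<YYYY_MM_DD_HH_MM_SS>.log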
-def postRelease(source):
-    # revision of EC utilities, will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
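-# postRelease() builds the Blazegraph load-by-URI request by hand; a sketch
-# of the same call letting requests handle query-string encoding (endpoint
-# and parameter name taken from the code above):
-# r = requests.post(_graphEndpoint(), params={"uri": release_url})
-# Using params= URL-encodes release_url, which matters once object paths
-# contain characters that are not query-safe.
-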
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
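-# Usage sketch for _get_client (assumes PORTAINER_URL / PORTAINER_KEY are
-# set and that a default DockerContainerContext() is acceptable here):
-# client = _get_client(DockerContainerContext())
-# client.ping()  # cheap connectivity check against the Docker/Portainer API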
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
-    ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
-    # docker.py: for a replicated job, total completions = replicas;
-    # with replicas=0 you do not get a container
-    service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
-        mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
-            raise Exception(f"Container for service {name} not starting")
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
-        # add in env variables here, e.g. "Env": ["FOO=bar", "BAZ=quux"]
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
-        cid = container.id  # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
-        ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
-        # Future: need to extract files and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def hmw1_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def hmw1_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "hmw1")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def hmw1_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "hmw1")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def hmw1_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "hmw1")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def hmw1_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "hmw1")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def hmw1_naburelease(context):
- returned_value = gleanerio(context,("release"), "hmw1")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def hmw1_uploadrelease(context):
- returned_value = postRelease("hmw1")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def hmw1_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="hmw1")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "hmw1"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-    r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def hmw1_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="hmw1")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "hmw1"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def hmw1_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="hmw1")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "hmw1"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def hmw1_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="hmw1")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "hmw1"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
-    get_dagster_logger().info(f"identifier stats report returned {r}")
- return
-
-@op(ins={"start": In(Nothing)})
-def hmw1_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "hmw1"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
-    get_dagster_logger().info(f"bucket urls report returned {r}")
- return
-
-
-#Can we simplify and use just a method. Then import these methods?
-# def missingreport_s3(context, msg: str, source="hmw1"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="hmw1"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_hmw1():
- containers = hmw1_getImage()
- harvest = hmw1_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = hmw1_missingreport_s3(start=harvest)
- report_idstat = hmw1_identifier_stats(start=report_ms3)
- # for some reason, this causes a msg parameter missing
- report_bucketurl = hmw1_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="hmw1")
- load_release = hmw1_naburelease(start=harvest)
- load_uploadrelease = hmw1_uploadrelease(start=load_release)
-
- load_prune = hmw1_nabu_prune(start=load_uploadrelease)
- load_prov = hmw1_nabuprov(start=load_prune)
- load_org = hmw1_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=hmw1_missingreport_graph(start=load_org)
- report_graph=hmw1_graph_reports(start=report_msgraph)
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_hu020.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_hu020.py
deleted file mode 100644
index eccccfb8..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_hu020.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils.util
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, and not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
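-# For local development, a sketch of the minimum environment this module
-# reads (names from the os.environ.get calls above; the values are examples,
-# not defaults shipped with the scheduler). Note GLEANERIO_MINIO_USE_SSL has
-# no default, and strtobool(None) raises, so it must always be set;
-# PORTAINER_URL / PORTAINER_KEY are also required for the docker client.
-# os.environ.setdefault("GLEANERIO_MINIO_ADDRESS", "localhost")
-# os.environ.setdefault("GLEANERIO_MINIO_PORT", "9000")
-# os.environ.setdefault("GLEANERIO_MINIO_USE_SSL", "false")
-# os.environ.setdefault("GLEANERIO_MINIO_ACCESS_KEY", "minioadmin")
-# os.environ.setdefault("GLEANERIO_MINIO_SECRET_KEY", "minioadmin")
-# os.environ.setdefault("GLEANERIO_MINIO_BUCKET", "gleaner")
-# os.environ.setdefault("GLEANERIO_GRAPH_URL", "http://localhost:9999/blazegraph")
-# os.environ.setdefault("GLEANERIO_GRAPH_NAMESPACE", "earthcube")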
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
-    # revision of EC utilities, will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
-    ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
-    # docker.py: for a replicated job, total completions = replicas;
-    # with replicas=0 you do not get a container
-    service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
-        mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
-            raise Exception(f"Container for service {name} not starting")
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
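- # Summary of the dispatch above (image and argv per mode):
- #   gleaner -> gleaner image: --cfg <gleaner cfg> -source <source> --rude
- #   prune   -> nabu image:    --cfg <nabu cfg> prune   --prefix summoned/<source>
- #   prov    -> nabu image:    --cfg <nabu cfg> prefix  --prefix prov/<source>
- #   orgs    -> nabu image:    --cfg <nabu cfg> prefix  --prefix orgs
- #   release -> nabu image:    --cfg <nabu cfg> release --prefix summoned/<source>
- # any other mode returns 1 without launching a container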
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
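- # Note: only the image, data["Cmd"] and data["WorkingDir"] reach the swarm
- # service below; env and network travel via DockerContainerContext instead,
- # so HostConfig here is a leftover from the plain container-API path.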
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
- cid = container.id # legacy until the container start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
- ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes-like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
- # Future: need to extract files and upload them
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def hu020_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def hu020_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "hu020")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def hu020_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "hu020")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def hu020_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "hu020")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def hu020_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "hu020")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def hu020_naburelease(context):
- returned_value = gleanerio(context,("release"), "hu020")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def hu020_uploadrelease(context):
- returned_value = postRelease("hu020")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def hu020_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="hu020")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "hu020"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def hu020_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="hu020")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "hu020"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def hu020_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="hu020")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "hu020"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def hu020_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="hu020")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "hu020"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def hu020_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "hu020"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
-#Can we simplify and use just a method. Then import these methods?
-# def missingreport_s3(context, msg: str, source="hu020"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="hu020"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_hu020():
- containers = hu020_getImage()
- harvest = hu020_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = hu020_missingreport_s3(start=harvest)
- report_idstat = hu020_identifier_stats(start=report_ms3)
- # for some reason, this caused a "msg parameter missing" error
- report_bucketurl = hu020_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="hu020")
- load_release = hu020_naburelease(start=harvest)
- load_uploadrelease = hu020_uploadrelease(start=load_release)
-
- load_prune = hu020_nabu_prune(start=load_uploadrelease)
- load_prov = hu020_nabuprov(start=load_prune)
- load_org = hu020_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=hu020_missingreport_graph(start=load_org)
- report_graph=hu020_graph_reports(start=report_msgraph)
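- # The graph is wired to a schedulable job elsewhere in the generated code,
- # e.g. (hypothetical name): implnet_job_hu020 = harvest_hu020.to_job()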
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_hu040.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_hu040.py
deleted file mode 100644
index 6173a826..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_hu040.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, and not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
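- # Usage sketch for the helper above (hostnames illustrative):
- #   _pythonMinioUrl("mybucket.s3.amazonaws.com") -> "s3.amazonaws.com"
- #   _pythonMinioUrl("minio.example.org")         -> "minio.example.org"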
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if it does not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
- # a revision of EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
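- # Illustrative request built above (hosts hypothetical):
- #   POST http://graph.example.org/namespace/earthcube/sparql?uri=http://minio.example.org:9000/gleaner/graphs/latest/hu040_release.nq
- # the graph store then fetches and loads the N-Quads release file itself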
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
- ## thoughts
- # return both service and container, since there is exactly one
- restart_policy = RestartPolicy(condition='none')
- # docker.py: for a replicated job, total completions = replicas
- # with replicas=0 you do not get a container
- service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
- mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise f"Container for service {name} not starting"
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add env variables here, e.g. "Env": ["FOO=bar", "BAZ=quux"]
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
- cid = container.id # legacy until the container start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
- ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes-like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
- # Future: need to extract files and upload them
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def hu040_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def hu040_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "hu040")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def hu040_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "hu040")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def hu040_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "hu040")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def hu040_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "hu040")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def hu040_naburelease(context):
- returned_value = gleanerio(context,("release"), "hu040")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def hu040_uploadrelease(context):
- returned_value = postRelease("hu040")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def hu040_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="hu040")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "hu040"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def hu040_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="hu040")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "hu040"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def hu040_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="hu040")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "hu040"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def hu040_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="hu040")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "hu040"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def hu040_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "hu040"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
-#Can we simplify and use just a method. Then import these methods?
-# def missingreport_s3(context, msg: str, source="hu040"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="hu040"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_hu040():
- containers = hu040_getImage()
- harvest = hu040_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = hu040_missingreport_s3(start=harvest)
- report_idstat = hu040_identifier_stats(start=report_ms3)
- # for some reason, this caused a "msg parameter missing" error
- report_bucketurl = hu040_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="hu040")
- load_release = hu040_naburelease(start=harvest)
- load_uploadrelease = hu040_uploadrelease(start=load_release)
-
- load_prune = hu040_nabu_prune(start=load_uploadrelease)
- load_prov = hu040_nabuprov(start=load_prune)
- load_org = hu040_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=hu040_missingreport_graph(start=load_org)
- report_graph=hu040_graph_reports(start=report_msgraph)
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_hu060.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_hu060.py
deleted file mode 100644
index 97e5300c..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_hu060.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, and not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
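- # load_data accepts either a URL or a local path: urlopen raises ValueError
- # ("unknown url type") for a plain path, and the except falls back to open().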
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if it does not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
- # a revision of EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
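- # Note: docker-py appears to expose no public per-request header hook, so the
- # Portainer X-API-Key is injected via client.api.headers and _general_configs
- # above (see the docker-py decorators link referenced in the gleanerio op).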
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
- ## thoughts
- # return both service and container, since there is exactly one
- restart_policy = RestartPolicy(condition='none')
- # docker.py: for a replicated job, total completions = replicas
- # with replicas=0 you do not get a container
- service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
- mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise f"Container for service {name} not starting"
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
-        cid = container.id # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
-        ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
-        # Future: need to extract files and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
-        if not DEBUG:
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
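-# A table-driven sketch of the nabu branches in gleanerio above; purely
-# illustrative, the generated ops keep the if/elif chain. Each entry defers
-# the source-dependent prefix until the source name is known.
-NABU_MODE_ARGS = {
-    "prune":   lambda source: ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source],
-    "prov":    lambda source: ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source],
-    "orgs":    lambda source: ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"],
-    "release": lambda source: ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source],
-}
-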
-@op
-def hu060_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def hu060_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "hu060")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def hu060_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "hu060")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def hu060_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "hu060")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def hu060_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "hu060")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def hu060_naburelease(context):
- returned_value = gleanerio(context,("release"), "hu060")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def hu060_uploadrelease(context):
- returned_value = postRelease("hu060")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def hu060_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="hu060")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "hu060"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-    r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def hu060_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="hu060")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "hu060"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def hu060_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="hu060")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "hu060"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def hu060_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="hu060")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "hu060"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def hu060_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "hu060"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
-# Can we simplify and use just a method, then import these methods?
-# def missingreport_s3(context, msg: str, source="hu060"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="hu060"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_hu060():
- containers = hu060_getImage()
- harvest = hu060_gleaner(start=containers)
-
-# defining nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = hu060_missingreport_s3(start=harvest)
- report_idstat = hu060_identifier_stats(start=report_ms3)
-    # for some reason, this causes a "msg parameter missing" error
- report_bucketurl = hu060_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="hu060")
- load_release = hu060_naburelease(start=harvest)
- load_uploadrelease = hu060_uploadrelease(start=load_release)
-
- load_prune = hu060_nabu_prune(start=load_uploadrelease)
- load_prov = hu060_nabuprov(start=load_prune)
- load_org = hu060_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=hu060_missingreport_graph(start=load_org)
- report_graph=hu060_graph_reports(start=report_msgraph)
-
-
-
-
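-# A self-contained sketch of the Nothing-dependency pattern harvest_hu060
-# relies on: the "start" input carries ordering only, never data. The op
-# names here are hypothetical.
-@op
-def _sketch_first(context):
-    get_dagster_logger().info("runs first")
-
-@op(ins={"start": In(Nothing)})
-def _sketch_second(context):
-    get_dagster_logger().info("runs strictly after _sketch_first")
-
-@graph
-def _sketch_ordered():
-    _sketch_second(start=_sketch_first())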
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_hu080.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_hu080.py
deleted file mode 100644
index 6b84e353..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_hu080.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, and not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
- # revision of EC utilities, will have a insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
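-# postRelease only substring-checks for 'data modified="0"'. A sketch of
-# extracting the actual mutation count instead; the regex assumes the
-# <data modified="N" .../> reply format implied by that check.
-import re
-def _modified_count(response_text):
-    m = re.search(r'data modified="(\d+)"', response_text)
-    return int(m.group(1)) if m else 0
-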
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
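-# _get_client above reaches into docker-py internals so every request carries
-# the X-API-Key header Portainer's Docker API proxy expects. The same idea in
-# one place, as a sketch; the helper name is an assumption.
-def _portainer_client():
-    client = docker.DockerClient(base_url=URL, version="1.43")
-    client.api.headers['X-API-Key'] = APIKEY
-    return client
-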
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
-    ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
-    # per docker.py: for a replicated job, total completions = replicas;
-    # with replicas=0 you do not get a container
-    service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
-        mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise f"Container for service {name} not starting"
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
-        cid = container.id # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
-        ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
-        # Future: need to extract files and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
-        if not DEBUG:
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def hu080_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def hu080_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "hu080")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def hu080_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "hu080")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def hu080_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "hu080")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def hu080_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "hu080")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def hu080_naburelease(context):
- returned_value = gleanerio(context,("release"), "hu080")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def hu080_uploadrelease(context):
- returned_value = postRelease("hu080")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def hu080_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="hu080")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "hu080"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-    r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def hu080_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="hu080")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "hu080"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def hu080_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="hu080")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "hu080"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def hu080_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="hu080")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "hu080"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def hu080_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "hu080"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
-# Can we simplify and use just a method, then import these methods?
-# def missingreport_s3(context, msg: str, source="hu080"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="hu080"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_hu080():
- containers = hu080_getImage()
- harvest = hu080_gleaner(start=containers)
-
-# defining nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = hu080_missingreport_s3(start=harvest)
- report_idstat = hu080_identifier_stats(start=report_ms3)
-    # for some reason, this causes a "msg parameter missing" error
- report_bucketurl = hu080_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="hu080")
- load_release = hu080_naburelease(start=harvest)
- load_uploadrelease = hu080_uploadrelease(start=load_release)
-
- load_prune = hu080_nabu_prune(start=load_uploadrelease)
- load_prov = hu080_nabuprov(start=load_prune)
- load_org = hu080_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=hu080_missingreport_graph(start=load_org)
- report_graph=hu080_graph_reports(start=report_msgraph)
-
-
-
-
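-# Every generated file repeats these ops verbatim with only the source name
-# changed. A sketch of a factory that could fold that duplication, assuming
-# nothing beyond what this module already defines; the generated code does
-# not use it.
-def _make_gleaner_op(source):
-    @op(name=f"{source}_gleaner_sketch", ins={"start": In(Nothing)})
-    def _gleaner(context):
-        gleanerio(context, "gleaner", source)
-    return _gleaner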
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_hu100.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_hu100.py
deleted file mode 100644
index 376573aa..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_hu100.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, and not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
- # revision of EC utilities, will have a insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
-    ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
-    # per docker.py: for a replicated job, total completions = replicas;
-    # with replicas=0 you do not get a container
-    service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
-        mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise f"Container for service {name} not starting"
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
-
-
-
-
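-# A sketch of the config-attachment step in _create_service: resolve a docker
-# config by name and mount it at a path inside the service's containers. The
-# helper name is an assumption.
-def _config_ref(client, config_name, target_path):
-    found = client.configs.list(filters={"name": [config_name]})
-    if not found:
-        raise Exception(f"docker config {config_name} not found")
-    return ConfigReference(found[0].id, config_name, target_path)
-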
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
- cid = container.id # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
- ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
- # Future: need to extract the files, then upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
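-
-# For reference, the five modes above roughly correspond to these CLI calls
-# (the binary comes from the image entrypoint; paths are the module defaults):
-#   gleaner : gleaner --cfg /gleaner/gleanerconfig.yaml -source <source> --rude
-#   prune   : nabu --cfg /nabu/nabuconfig.yaml prune --prefix summoned/<source>
-#   prov    : nabu --cfg /nabu/nabuconfig.yaml prefix --prefix prov/<source>
-#   orgs    : nabu --cfg /nabu/nabuconfig.yaml prefix --prefix orgs
-#   release : nabu --cfg /nabu/nabuconfig.yaml release --prefix summoned/<source>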
-
-@op
-def hu100_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def hu100_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "hu100")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def hu100_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "hu100")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def hu100_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "hu100")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def hu100_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "hu100")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def hu100_naburelease(context):
- returned_value = gleanerio(context,("release"), "hu100")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def hu100_uploadrelease(context):
- returned_value = postRelease("hu100")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def hu100_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="hu100")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "hu100"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def hu100_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="hu100")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "hu100"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def hu100_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="hu100")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "hu100"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def hu100_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="hu100")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "hu100"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def hu100_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "hu100"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
-# Can we simplify and use just a method, then import these methods?
-# def missingreport_s3(context, msg: str, source="hu100"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="hu100"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_hu100():
- containers = hu100_getImage()
- harvest = hu100_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = hu100_missingreport_s3(start=harvest)
- report_idstat = hu100_identifier_stats(start=report_ms3)
- # for some reason, this caused a "msg parameter missing" error
- report_bucketurl = hu100_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="hu100")
- load_release = hu100_naburelease(start=harvest)
- load_uploadrelease = hu100_uploadrelease(start=load_release)
-
- load_prune = hu100_nabu_prune(start=load_uploadrelease)
- load_prov = hu100_nabuprov(start=load_prune)
- load_org = hu100_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=hu100_missingreport_graph(start=load_org)
- report_graph=hu100_graph_reports(start=report_msgraph)
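-
-# A hedged sketch of how a generated graph is typically surfaced to Dagster;
-# the job name and cron string are illustrative assumptions, not from this repo:
-#   from dagster import ScheduleDefinition
-#   harvest_hu100_job = harvest_hu100.to_job(name="harvest_hu100_job")
-#   harvest_hu100_schedule = ScheduleDefinition(job=harvest_hu100_job, cron_schedule="0 6 * * *")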
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_huc12pp0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_huc12pp0.py
deleted file mode 100644
index 17204a23..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_huc12pp0.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
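-# Illustrative behavior of the helper above:
-#   _pythonMinioUrl("play.s3.amazonaws.com") -> "s3.amazonaws.com"
-#   _pythonMinioUrl("minio.example.org")     -> "minio.example.org"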
-def read_file_bytestream(image_path):
- with open(image_path, 'rb') as f:
-     data = f.read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if GLEANER_MINIO_PORT == "80" and not secure:
-     server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif GLEANER_MINIO_PORT == "443" and secure:
-     server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
- # revision of EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
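-
-# The POST above leans on Blazegraph's load-by-URI REST form; an equivalent
-# manual call (hostnames and namespace are illustrative assumptions) would be:
-#   curl -X POST "http://graph:9999/blazegraph/namespace/<ns>/sparql?uri=http://minio:9000/<bucket>/graphs/latest/<source>_release.nq"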
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
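-
-# Editorial aside: the _general_configs mutation above injects Portainer's
-# X-API-Key header into every request the docker SDK makes. A raw equivalent
-# against the same Docker API proxy (the /version path is an assumption) is:
-#   requests.get(f"{URL}/version", headers={"X-API-Key": APIKEY}).json()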
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
- ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
- # docker.py: for a replicated job, total completions = replicas
- # replicas=0 means you do not get a container
- service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
- mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise f"Container for service {name} not starting"
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
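-
-# For reference, a trimmed swarm task dict from service.tasks() looks roughly
-# like this (fields illustrative, not exhaustive):
-#   {"ID": "...", "ServiceID": "...", "Status": {"State": "running", ...}}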
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
- cid = container.id # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
- ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
- # Future: need to extract the files, then upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
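-
-# e.g. gleanerio(context, "prune", "huc12pp0") runs a one-shot service named
-# "sch_huc12pp0_prune" whose image entrypoint receives:
-#   --cfg /nabu/nabuconfig.yaml prune --prefix summoned/huc12pp0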
-
-@op
-def huc12pp0_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def huc12pp0_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "huc12pp0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def huc12pp0_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "huc12pp0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def huc12pp0_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "huc12pp0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def huc12pp0_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "huc12pp0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def huc12pp0_naburelease(context):
- returned_value = gleanerio(context,("release"), "huc12pp0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def huc12pp0_uploadrelease(context):
- returned_value = postRelease("huc12pp0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def huc12pp0_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="huc12pp0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "huc12pp0"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def huc12pp0_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="huc12pp0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "huc12pp0"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def huc12pp0_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="huc12pp0")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "huc12pp0"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def huc12pp0_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="huc12pp0")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "huc12pp0"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def huc12pp0_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "huc12pp0"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
-# Can we simplify and use just a method, then import these methods?
-# def missingreport_s3(context, msg: str, source="huc12pp0"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="huc12pp0"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_huc12pp0():
- containers = huc12pp0_getImage()
- harvest = huc12pp0_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = huc12pp0_missingreport_s3(start=harvest)
- report_idstat = huc12pp0_identifier_stats(start=report_ms3)
- # for some reason, this caused a "msg parameter missing" error
- report_bucketurl = huc12pp0_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="huc12pp0")
- load_release = huc12pp0_naburelease(start=harvest)
- load_uploadrelease = huc12pp0_uploadrelease(start=load_release)
-
- load_prune = huc12pp0_nabu_prune(start=load_uploadrelease)
- load_prov = huc12pp0_nabuprov(start=load_prune)
- load_org = huc12pp0_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=huc12pp0_missingreport_graph(start=load_org)
- report_graph=huc12pp0_graph_reports(start=report_msgraph)
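-
-# Resulting Nothing-dependency chains (derived from the wiring above):
-#   getImage -> gleaner -> missingreport_s3 -> identifier_stats -> bucket_urls
-#   gleaner -> naburelease -> uploadrelease -> nabu_prune -> nabuprov -> nabuorg
-#           -> missingreport_graph -> graph_reports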
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_huc12pp1.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_huc12pp1.py
deleted file mode 100644
index c2b1b731..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_huc12pp1.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- with open(image_path, 'rb') as f:
-     data = f.read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
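-
-# Illustrative: load_data("https://example.org/sitemap.xml") fetches over HTTP,
-# while load_data("/tmp/sitemap.xml") raises ValueError in urlopen and falls
-# back to the local file read.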
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if GLEANER_MINIO_PORT == "80" and not secure:
-     server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif GLEANER_MINIO_PORT == "443" and secure:
-     server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
- # revision of EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
- ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
- # docker.py: for a replicated job, total completions = replicas
- # replicas=0 means you do not get a container
- service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
- mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise f"Container for service {name} not starting"
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
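- # a minimal sketch of that idea (hypothetical, not wired up), assuming a
- # PROJECT env var is visible to this process:
- # project = os.environ.get("PROJECT", "eco")
- # data["Volumes"] = [f"dagster-{project}:/configs"]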
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
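- # merge() layers these op-level overrides (env, network, working dir) on
- # top of whatever container context the run launcher supplied for this run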
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
- cid = container.id # legacy until the service start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
- ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
- # Future: need to extract the files and upload them
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def huc12pp1_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def huc12pp1_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "huc12pp1")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def huc12pp1_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "huc12pp1")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def huc12pp1_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "huc12pp1")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def huc12pp1_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "huc12pp1")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def huc12pp1_naburelease(context):
- returned_value = gleanerio(context,("release"), "huc12pp1")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def huc12pp1_uploadrelease(context):
- returned_value = postRelease("huc12pp1")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def huc12pp1_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="huc12pp1")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "huc12pp1"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def huc12pp1_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="huc12pp1")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "huc12pp1"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def huc12pp1_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="huc12pp1")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "huc12pp1"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def huc12pp1_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="huc12pp1")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "huc12pp1"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def huc12pp1_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "huc12pp1"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
-#Can we simplify and use just a method. Then import these methods?
-# def missingreport_s3(context, msg: str, source="huc12pp1"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="huc12pp1"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_huc12pp1():
- containers = huc12pp1_getImage()
- harvest = huc12pp1_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = huc12pp1_missingreport_s3(start=harvest)
- report_idstat = huc12pp1_identifier_stats(start=report_ms3)
- # for some reason, the call below caused a missing msg parameter error
- report_bucketurl = huc12pp1_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="huc12pp1")
- load_release = huc12pp1_naburelease(start=harvest)
- load_uploadrelease = huc12pp1_uploadrelease(start=load_release)
-
- load_prune = huc12pp1_nabu_prune(start=load_uploadrelease)
- load_prov = huc12pp1_nabuprov(start=load_prune)
- load_org = huc12pp1_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=huc12pp1_missingreport_graph(start=load_org)
- report_graph=huc12pp1_graph_reports(start=report_msgraph)
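-# overall order: getImage -> gleaner harvest, then two chains off the harvest:
-# (missing s3 report -> identifier stats -> bucket urls) and
-# (release -> upload release -> prune -> prov -> orgs -> graph reports)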
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_hydrologicunit0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_hydrologicunit0.py
deleted file mode 100644
index 7117cbe5..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_hydrologicunit0.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET = str(os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE = str(os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
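- # e.g. "s3.us-west-2.amazonaws.com" -> "s3.amazonaws.com";
- # any non-AWS endpoint passes through unchanged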
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- with open(image_path, 'rb') as f:
- return f.read()
-
-
-def load_data(file_or_url):
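- # accepts either a URL or a local file path: urlopen raises ValueError
- # ("unknown url type") for a plain path, and we fall back to reading a file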
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
- # a future revision of the EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # a successful load responds with e.g. <data modified="5" milliseconds="..."/>
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
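- # docker-py has no first-class option for extra auth headers, so the
- # Portainer API key is pushed into both the persisted general config and
- # the live session headers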
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
- ## thoughts
- # return (service, container), since a replicated job has exactly one container
- restart_policy = RestartPolicy(condition='none')
- # per docker-py: for a replicated job, total completions = replicas;
- # with replicas=0 you do not get a container
- service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env=env_vars,
- name=name,
- networks=container_context.networks if len(container_context.networks) else None,
- restart_policy=restart_policy,
- mode=service_mode,
- workdir=workingdir,
- configs=configs,
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise f"Container for service {name} not starting"
-
- get_dagster_logger().info(str(len(containers)))
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
- cid = container.id # legacy until the service start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
- ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
- # Future: need to extract the files and upload them
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def hydrologicunit0_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def hydrologicunit0_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "hydrologicunit0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def hydrologicunit0_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "hydrologicunit0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def hydrologicunit0_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "hydrologicunit0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def hydrologicunit0_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "hydrologicunit0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def hydrologicunit0_naburelease(context):
- returned_value = gleanerio(context,("release"), "hydrologicunit0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def hydrologicunit0_uploadrelease(context):
- returned_value = postRelease("hydrologicunit0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def hydrologicunit0_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="hydrologicunit0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "hydrologicunit0"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def hydrologicunit0_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="hydrologicunit0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "hydrologicunit0"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def hydrologicunit0_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="hydrologicunit0")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "hydrologicunit0"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def hydrologicunit0_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="hydrologicunit0")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "hydrologicunit0"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def hydrologicunit0_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "hydrologicunit0"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
-#Can we simplify and use just a method. Then import these methods?
-# def missingreport_s3(context, msg: str, source="hydrologicunit0"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="hydrologicunit0"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_hydrologicunit0():
- containers = hydrologicunit0_getImage()
- harvest = hydrologicunit0_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = hydrologicunit0_missingreport_s3(start=harvest)
- report_idstat = hydrologicunit0_identifier_stats(start=report_ms3)
- # for some reason, the call below caused a missing msg parameter error
- report_bucketurl = hydrologicunit0_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="hydrologicunit0")
- load_release = hydrologicunit0_naburelease(start=harvest)
- load_uploadrelease = hydrologicunit0_uploadrelease(start=load_release)
-
- load_prune = hydrologicunit0_nabu_prune(start=load_uploadrelease)
- load_prov = hydrologicunit0_nabuprov(start=load_prune)
- load_org = hydrologicunit0_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=hydrologicunit0_missingreport_graph(start=load_org)
- report_graph=hydrologicunit0_graph_reports(start=report_msgraph)
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_links0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_links0.py
deleted file mode 100644
index 3faabe65..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_links0.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET = str(os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE = str(os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- with open(image_path, 'rb') as f:
- return f.read()
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
- # a future revision of the EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # a successful load responds with e.g. <data modified="5" milliseconds="..."/>
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
- ## thoughts
- # return (service, container), since a replicated job has exactly one container
- restart_policy = RestartPolicy(condition='none')
- # per docker-py: for a replicated job, total completions = replicas;
- # with replicas=0 you do not get a container
- service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env=env_vars,
- name=name,
- networks=container_context.networks if len(container_context.networks) else None,
- restart_policy=restart_policy,
- mode=service_mode,
- workdir=workingdir,
- configs=configs,
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise f"Container for service {name} not starting"
-
- get_dagster_logger().info(str(len(containers)))
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
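- # summary of the dispatch above: "gleaner" summons a source into the
- # bucket; the nabu modes (release/prune/prov/orgs) sync the resulting
- # objects into the graph store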
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
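- # these MINIO_*/SPARQL_*/GLEANER_* vars override the values in the YAML
- # configs inside the container, via the viper BindEnv mappings listed in
- # the comment block above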
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
- cid = container.id # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
- ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
- # Future: need to extract files, and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def links0_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
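- # pre-pull both images so the swarm service created later does not
- # stall on an image download at run time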
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def links0_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "links0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def links0_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "links0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def links0_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "links0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def links0_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "links0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def links0_naburelease(context):
- returned_value = gleanerio(context,("release"), "links0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def links0_uploadrelease(context):
- returned_value = postRelease("links0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def links0_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="links0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "links0"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def links0_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="links0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "links0"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def links0_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="links0")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "links0"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def links0_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="links0")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "links0"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def links0_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "links0"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
-# Can we simplify this into plain methods and then import those methods?
-# def missingreport_s3(context, msg: str, source="links0"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="links0"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_links0():
- containers = links0_getImage()
- harvest = links0_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = links0_missingreport_s3(start=harvest)
- report_idstat = links0_identifier_stats(start=report_ms3)
- # for some reason, this caused a "missing msg parameter" error
- report_bucketurl = links0_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="links0")
- load_release = links0_naburelease(start=harvest)
- load_uploadrelease = links0_uploadrelease(start=load_release)
-
- load_prune = links0_nabu_prune(start=load_uploadrelease)
- load_prov = links0_nabuprov(start=load_prune)
- load_org = links0_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=links0_missingreport_graph(start=load_org)
- report_graph=links0_graph_reports(start=report_msgraph)
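-# overall chain: getImage -> gleaner harvest -> s3 missing/identifier/bucket
-# reports, and harvest -> release -> upload -> prune -> prov -> orgs ->
-# graph reports, all sequenced through the Nothing 'start' inputs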
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_mainstems0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_mainstems0.py
deleted file mode 100644
index cca2df36..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_mainstems0.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils
-import time
-
-from dagster import job, op, graph, In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, and not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
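- # any *.amazonaws.com host is collapsed to the canonical s3.amazonaws.com
- # endpoint the minio client expects; other hosts pass through unchanged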
-def read_file_bytestream(image_path):
- with open(image_path, 'rb') as f:
- data = f.read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
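- # urlopen raises ValueError for strings that are not URLs, which is what
- # makes the fall-through to a local file read work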
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
- # revision of EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # Blazegraph answers 200 with a body like <data modified="0" .../> when nothing was inserted
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
- ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
- # docker.py: for a replicated job, total completions = replicas
- # replicas=0 means you do not get a container
- service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
- mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise Exception(f"Container for service {name} not starting")
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
- cid = container.id # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
- ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
- # Future: need to extract files, and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def mainstems0_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def mainstems0_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "mainstems0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def mainstems0_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "mainstems0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def mainstems0_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "mainstems0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def mainstems0_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "mainstems0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def mainstems0_naburelease(context):
- returned_value = gleanerio(context,("release"), "mainstems0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def mainstems0_uploadrelease(context):
- returned_value = postRelease("mainstems0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def mainstems0_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="mainstems0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "mainstems0"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def mainstems0_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="mainstems0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "mainstems0"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def mainstems0_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="mainstems0")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "mainstems0"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def mainstems0_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="mainstems0")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "mainstems0"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def mainstems0_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "mainstems0"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
-# Can we simplify this into plain methods and then import those methods?
-# def missingreport_s3(context, msg: str, source="mainstems0"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="mainstems0"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_mainstems0():
- containers = mainstems0_getImage()
- harvest = mainstems0_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = mainstems0_missingreport_s3(start=harvest)
- report_idstat = mainstems0_identifier_stats(start=report_ms3)
- # for some reason, this caused a "missing msg parameter" error
- report_bucketurl = mainstems0_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="mainstems0")
- load_release = mainstems0_naburelease(start=harvest)
- load_uploadrelease = mainstems0_uploadrelease(start=load_release)
-
- load_prune = mainstems0_nabu_prune(start=load_uploadrelease)
- load_prov = mainstems0_nabuprov(start=load_prune)
- load_org = mainstems0_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=mainstems0_missingreport_graph(start=load_org)
- report_graph=mainstems0_graph_reports(start=report_msgraph)
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nataq0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nataq0.py
deleted file mode 100644
index 4eaac896..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nataq0.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils
-import time
-
-from dagster import job, op, graph, In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, and not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- with open(image_path, 'rb') as f:
- data = f.read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
- # revision of EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # Blazegraph answers 200 with a body like <data modified="0" .../> when nothing was inserted
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
- ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
- # docker.py: for a replicated job, total completions = replicas
- # replicas=0 means you do not get a container
- service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
- mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise Exception(f"Container for service {name} not starting")
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
-
-
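`_create_service` runs each Gleaner/Nabu invocation as a one-shot Swarm service (`replicated-job` mode, one replica, `RestartPolicy(condition='none')`) and then polls for the container Swarm schedules for it, giving up after roughly 12 seconds. A sketch of that wait loop with a real exception on timeout (the generated code raises a bare f-string, which is a `TypeError` in Python 3):

```python
# Sketch of the wait-for-container loop with an explicit timeout error.
# `client` is a docker.DockerClient; the label is the one Swarm puts on
# containers it schedules for a service.
import time

def wait_for_service_container(client, service_name: str, timeout_s: int = 12):
    label = f"com.docker.swarm.service.name={service_name}"
    for _ in range(timeout_s):
        containers = client.containers.list(all=True, filters={"label": label})
        if containers:
            return containers[0]
        time.sleep(1)
    raise TimeoutError(f"Container for service {service_name} not starting")
```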
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
-        cid = container.id # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
-        ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
-        # Future: need to extract files, and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
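The if/elif ladder at the top of `gleanerio` maps each mode to an image, argument vector, and working directory; the four Nabu modes differ only in subcommand and prefix. An illustrative consolidation as a dispatch table (the `gleanerio_command` helper is a sketch, not code from this repo; the `GLEANERIO_*` constants are the module-level ones defined earlier):

```python
# Illustrative consolidation of the mode ladder in gleanerio(); the
# helper itself is a sketch, not repo code. GLEANERIO_* constants are
# the module-level ones defined above.
def gleanerio_command(mode: str, source: str):
    def nabu(*args):
        return (GLEANERIO_NABU_IMAGE,
                ["--cfg", GLEANERIO_NABU_CONFIG_PATH, *args],
                "/nabu/")
    table = {
        "gleaner": (GLEANERIO_GLEANER_IMAGE,
                    ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,
                     "-source", source, "--rude"],
                    "/gleaner/"),
        "prune":   nabu("prune", "--prefix", f"summoned/{source}"),
        "prov":    nabu("prefix", "--prefix", f"prov/{source}"),
        "orgs":    nabu("prefix", "--prefix", "orgs"),
        "release": nabu("release", "--prefix", f"summoned/{source}"),
    }
    if mode not in table:
        raise ValueError(f"unknown gleanerio mode: {mode}")
    image, args, workdir = table[mode]
    return image, args, workdir, f"sch_{source}_{mode}"
```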
-@op
-def nataq0_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def nataq0_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "nataq0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nataq0_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "nataq0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nataq0_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "nataq0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nataq0_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "nataq0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nataq0_naburelease(context):
- returned_value = gleanerio(context,("release"), "nataq0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def nataq0_uploadrelease(context):
- returned_value = postRelease("nataq0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def nataq0_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nataq0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nataq0"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-    r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def nataq0_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nataq0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nataq0"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def nataq0_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nataq0")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nataq0"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nataq0_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nataq0")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nataq0"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nataq0_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nataq0"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
-#Can we simplify and use just a method. Then import these methods?
-# def missingreport_s3(context, msg: str, source="nataq0"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="nataq0"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_nataq0():
- containers = nataq0_getImage()
- harvest = nataq0_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = nataq0_missingreport_s3(start=harvest)
- report_idstat = nataq0_identifier_stats(start=report_ms3)
-    # for some reason, this causes a "msg parameter missing" error
- report_bucketurl = nataq0_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="nataq0")
- load_release = nataq0_naburelease(start=harvest)
- load_uploadrelease = nataq0_uploadrelease(start=load_release)
-
- load_prune = nataq0_nabu_prune(start=load_uploadrelease)
- load_prov = nataq0_nabuprov(start=load_prune)
- load_org = nataq0_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=nataq0_missingreport_graph(start=load_org)
- report_graph=nataq0_graph_reports(start=report_msgraph)
-
-
-
-
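The generated `harvest_*` graphs sequence side-effecting ops that return no data by declaring a `start: In(Nothing)` input on each op and threading outputs through it, per the Dagster docs linked in the comment above. A minimal self-contained sketch of the pattern:

```python
# Minimal, self-contained sketch of the In(Nothing) sequencing pattern
# the generated harvest graphs use: Nothing inputs order ops that pass
# no data between them.
from dagster import In, Nothing, graph, op

@op
def first_step():
    pass

@op(ins={"start": In(Nothing)})
def second_step():
    pass

@op(ins={"start": In(Nothing)})
def third_step():
    pass

@graph
def ordered_pipeline():
    third_step(start=second_step(start=first_step()))
```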
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nmwdiose0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nmwdiose0.py
deleted file mode 100644
index 7d309ca8..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nmwdiose0.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils.util
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, and not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET = str(os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE = str(os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
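One caveat in the constants block: `distutils.util.strtobool` is deprecated and `distutils` was removed in Python 3.12, and `strtobool` also raises if the environment variable is unset (`os.environ.get` returning `None`). A stdlib-only sketch of a replacement (`env_bool` is a hypothetical helper name, not part of this repo):

```python
# Stdlib-only sketch of a strtobool replacement (distutils is
# deprecated and removed in Python 3.12). env_bool is a hypothetical
# helper name; it also tolerates unset vars, which strtobool(None)
# does not.
import os

_TRUE = {"y", "yes", "t", "true", "on", "1"}
_FALSE = {"n", "no", "f", "false", "off", "0"}

def env_bool(name: str, default: bool = False) -> bool:
    raw = os.environ.get(name)
    if raw is None:
        return default
    value = raw.strip().lower()
    if value in _TRUE:
        return True
    if value in _FALSE:
        return False
    raise ValueError(f"invalid boolean for {name}: {raw!r}")

# GLEANER_MINIO_USE_SSL = env_bool('GLEANERIO_MINIO_USE_SSL')
```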
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
- # revision of EC utilities, will have a insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
-    ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
-    # docker.py: for a replicated job, total completions = replicas
-    # with replicas=0 you do not get a container
-    service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
-        mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise f"Container for service {name} not starting"
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
-        cid = container.id # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
-        ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
-        # Future: need to extract files, and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
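The log-archive step in `gleanerio` calls the Docker Engine `GET /containers/{id}/archive` endpoint, which returns a tar stream of the requested path; the generated code uploads that tar verbatim and leaves unpacking as future work (the `extract files, and upload` comment). A sketch of the in-memory extraction that TODO points at, assuming `archive_bytes` holds the endpoint's response body:

```python
# Sketch for the "extract files, and upload" TODO: the Docker Engine
# archive endpoint returns a tar stream, which tarfile can walk in
# memory. archive_bytes would be r.read() from the request above.
import io
import tarfile

def iter_archive_members(archive_bytes: bytes):
    with tarfile.open(fileobj=io.BytesIO(archive_bytes), mode="r:*") as tar:
        for member in tar.getmembers():
            if member.isfile():
                extracted = tar.extractfile(member)
                if extracted is not None:
                    yield member.name, extracted.read()

# for name, blob in iter_archive_members(archive_bytes):
#     s3loader(blob, f"{source}_{name}_runlogs")
```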
-@op
-def nmwdiose0_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def nmwdiose0_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "nmwdiose0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nmwdiose0_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "nmwdiose0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nmwdiose0_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "nmwdiose0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nmwdiose0_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "nmwdiose0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nmwdiose0_naburelease(context):
- returned_value = gleanerio(context,("release"), "nmwdiose0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def nmwdiose0_uploadrelease(context):
- returned_value = postRelease("nmwdiose0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def nmwdiose0_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nmwdiose0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nmwdiose0"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-    r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def nmwdiose0_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nmwdiose0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nmwdiose0"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def nmwdiose0_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nmwdiose0")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nmwdiose0"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nmwdiose0_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nmwdiose0")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nmwdiose0"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nmwdiose0_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nmwdiose0"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
-#Can we simplify and use just a method. Then import these methods?
-# def missingreport_s3(context, msg: str, source="nmwdiose0"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="nmwdiose0"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_nmwdiose0():
- containers = nmwdiose0_getImage()
- harvest = nmwdiose0_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = nmwdiose0_missingreport_s3(start=harvest)
- report_idstat = nmwdiose0_identifier_stats(start=report_ms3)
-    # for some reason, this causes a "msg parameter missing" error
- report_bucketurl = nmwdiose0_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="nmwdiose0")
- load_release = nmwdiose0_naburelease(start=harvest)
- load_uploadrelease = nmwdiose0_uploadrelease(start=load_release)
-
- load_prune = nmwdiose0_nabu_prune(start=load_uploadrelease)
- load_prov = nmwdiose0_nabuprov(start=load_prune)
- load_org = nmwdiose0_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=nmwdiose0_missingreport_graph(start=load_org)
- report_graph=nmwdiose0_graph_reports(start=report_msgraph)
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nmwdiose1.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nmwdiose1.py
deleted file mode 100644
index 8c16bf16..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nmwdiose1.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils.util
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, and not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET = str(os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE = str(os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
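`Minio.get_object` returns a urllib3 response that should be read, closed, and released back to the connection pool; `s3reader` above hands it back raw and, on `S3Error`, falls through to an implicit `None`. A hedged sketch of a safe full read (bucket and object names are placeholders):

```python
# Hedged sketch of safely consuming the urllib3 response that
# Minio.get_object returns; bucket and object names are placeholders.
import logging
from minio import Minio
from minio.error import S3Error

def read_object_bytes(client: Minio, bucket: str, name: str) -> bytes | None:
    try:
        resp = client.get_object(bucket, name)
    except S3Error as err:
        logging.warning("S3 read error: %s", err)
        return None
    try:
        return resp.read()
    finally:
        resp.close()
        resp.release_conn()
```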
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
- # revision of EC utilities, will have a insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
-    ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
-    # docker.py: for a replicated job, total completions = replicas
-    # with replicas=0 you do not get a container
-    service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
-        mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise f"Container for service {name} not starting"
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
- cid = container.id # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
- ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
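- # Fetch {WorkingDir}/logs from the container as a tar archive via the Docker
- # Engine API (GET /containers/{id}/archive, proxied through Portainer).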
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
- # Future: need to extract files and upload them
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def nmwdiose1_getImage(context):
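- # Pre-pull the gleaner and nabu images on the target Docker host so the
- # per-source swarm services do not block on an image pull at run time.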
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def nmwdiose1_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "nmwdiose1")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nmwdiose1_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "nmwdiose1")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nmwdiose1_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "nmwdiose1")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nmwdiose1_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "nmwdiose1")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nmwdiose1_naburelease(context):
- returned_value = gleanerio(context,("release"), "nmwdiose1")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def nmwdiose1_uploadrelease(context):
- returned_value = postRelease("nmwdiose1")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def nmwdiose1_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nmwdiose1")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nmwdiose1"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def nmwdiose1_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nmwdiose1")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nmwdiose1"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def nmwdiose1_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nmwdiose1")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nmwdiose1"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nmwdiose1_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nmwdiose1")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nmwdiose1"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nmwdiose1_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nmwdiose1"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
-# Can we simplify and just use a method, then import these methods?
-# def missingreport_s3(context, msg: str, source="nmwdiose1"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="nmwdiose1"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_nmwdiose1():
- containers = nmwdiose1_getImage()
- harvest = nmwdiose1_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
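- # Run order: getImage -> gleaner harvest; then s3/identifier/bucket reports;
- # release -> upload -> prune -> prov -> orgs; graph reports run after the load.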
-
- report_ms3 = nmwdiose1_missingreport_s3(start=harvest)
- report_idstat = nmwdiose1_identifier_stats(start=report_ms3)
- # for some reason, this causes a msg parameter missing
- report_bucketurl = nmwdiose1_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="nmwdiose1")
- load_release = nmwdiose1_naburelease(start=harvest)
- load_uploadrelease = nmwdiose1_uploadrelease(start=load_release)
-
- load_prune = nmwdiose1_nabu_prune(start=load_uploadrelease)
- load_prov = nmwdiose1_nabuprov(start=load_prune)
- load_org = nmwdiose1_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=nmwdiose1_missingreport_graph(start=load_org)
- report_graph=nmwdiose1_graph_reports(start=report_msgraph)
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nmwdiose2.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nmwdiose2.py
deleted file mode 100644
index 04b63cab..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nmwdiose2.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, and not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
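- # Upload a bytes payload to the MinIO log prefix under a timestamped object name.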
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
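- # Ask the graph store to ingest the latest release nquads for this source
- # directly from MinIO, via the (Blazegraph-style) "?uri=" load endpoint.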
- # revision of EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # Blazegraph-style responses report data modified="0" when nothing was loaded
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
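- # Inject the Portainer X-API-Key header into the underlying docker-py client.
- # docker-py offers no constructor hook for custom headers, so the low-level
- # API client's general configs and session headers are patched directly.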
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
- ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
- # docker.py: for a replicated job, total completions = replicas;
- # with replicas=0 you do not get a container
- service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
- mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
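- # Poll (roughly 12s max) for the swarm task's container to appear; a
- # replicated-job service creates its container asynchronously.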
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise f"Container for service {name} not starting"
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
- cid = container.id # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
- ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
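- # Fetch {WorkingDir}/logs from the container as a tar archive via the Docker
- # Engine API (GET /containers/{id}/archive, proxied through Portainer).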
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
- # Future: need to extract files and upload them
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def nmwdiose2_getImage(context):
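- # Pre-pull the gleaner and nabu images on the target Docker host so the
- # per-source swarm services do not block on an image pull at run time.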
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def nmwdiose2_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "nmwdiose2")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nmwdiose2_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "nmwdiose2")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nmwdiose2_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "nmwdiose2")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nmwdiose2_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "nmwdiose2")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nmwdiose2_naburelease(context):
- returned_value = gleanerio(context,("release"), "nmwdiose2")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def nmwdiose2_uploadrelease(context):
- returned_value = postRelease("nmwdiose2")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def nmwdiose2_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nmwdiose2")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nmwdiose2"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def nmwdiose2_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nmwdiose2")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nmwdiose2"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def nmwdiose2_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nmwdiose2")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nmwdiose2"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nmwdiose2_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nmwdiose2")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nmwdiose2"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nmwdiose2_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nmwdiose2"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
-# Can we simplify and just use a method, then import these methods?
-# def missingreport_s3(context, msg: str, source="nmwdiose2"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="nmwdiose2"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_nmwdiose2():
- containers = nmwdiose2_getImage()
- harvest = nmwdiose2_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
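- # Run order: getImage -> gleaner harvest; then s3/identifier/bucket reports;
- # release -> upload -> prune -> prov -> orgs; graph reports run after the load.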
-
- report_ms3 = nmwdiose2_missingreport_s3(start=harvest)
- report_idstat = nmwdiose2_identifier_stats(start=report_ms3)
- # for some reason, this causes a msg parameter missing
- report_bucketurl = nmwdiose2_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="nmwdiose2")
- load_release = nmwdiose2_naburelease(start=harvest)
- load_uploadrelease = nmwdiose2_uploadrelease(start=load_release)
-
- load_prune = nmwdiose2_nabu_prune(start=load_uploadrelease)
- load_prov = nmwdiose2_nabuprov(start=load_prune)
- load_org = nmwdiose2_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=nmwdiose2_missingreport_graph(start=load_org)
- report_graph=nmwdiose2_graph_reports(start=report_msgraph)
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nmwdiose3.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nmwdiose3.py
deleted file mode 100644
index 5995e358..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nmwdiose3.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, and not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
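- # Upload a bytes payload to the MinIO log prefix under a timestamped object name.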
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
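- # Ask the graph store to ingest the latest release nquads for this source
- # directly from MinIO, via the (Blazegraph-style) "?uri=" load endpoint.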
- # revision of EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # Blazegraph-style responses report data modified="0" when nothing was loaded
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
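- # Inject the Portainer X-API-Key header into the underlying docker-py client.
- # docker-py offers no constructor hook for custom headers, so the low-level
- # API client's general configs and session headers are patched directly.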
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
- ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
- # docker.py: for a replicated job, total completions = replicas;
- # with replicas=0 you do not get a container
- service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
- mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise Exception(f"Container for service {name} not starting")
-
- get_dagster_logger().info(f"containers found: {len(containers)}")
- return service, containers[0]
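-
- # Note (added sketch): a swarm service in "replicated-job" mode runs its
- # container to completion exactly once, and the polling loop above only waits
- # for that container to exist so its logs can be followed. Hypothetical call:
- # service, container = _create_service(ctx, client, cctx,
- # "nsfearthcube/gleaner:latest", None, ["--cfg", "/gleaner/gleanerconfig.yaml"],
- # name="sch_demo_gleaner")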
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
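-
- # Note (added): the mode strings above map to container invocations roughly
- # as follows (paths assume the default config locations; the image entrypoint
- # supplies the binary since Entrypoint is commented out below):
- # "gleaner" -> gleaner --cfg /gleaner/gleanerconfig.yaml -source <src> --rude
- # "prune" | "release" -> nabu --cfg /nabu/nabuconfig.yaml <mode> --prefix summoned/<src>
- # "prov" -> nabu --cfg /nabu/nabuconfig.yaml prefix --prefix prov/<src>
- # "orgs" -> nabu --cfg /nabu/nabuconfig.yaml prefix --prefix orgs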
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- service = None # ensure the finally block below can test it safely
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
- cid = container.id # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
- ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
- # Future: need to extract files, and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def nmwdiose3_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def nmwdiose3_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "nmwdiose3")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nmwdiose3_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "nmwdiose3")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nmwdiose3_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "nmwdiose3")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nmwdiose3_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "nmwdiose3")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nmwdiose3_naburelease(context):
- returned_value = gleanerio(context,("release"), "nmwdiose3")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def nmwdiose3_uploadrelease(context):
- returned_value = postRelease("nmwdiose3")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def nmwdiose3_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nmwdiose3")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nmwdiose3"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def nmwdiose3_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nmwdiose3")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nmwdiose3"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def nmwdiose3_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nmwdiose3")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nmwdiose3"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nmwdiose3_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nmwdiose3")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nmwdiose3"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nmwdiose3_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nmwdiose3"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
-# Can we simplify this to plain methods and then import those methods?
-# def missingreport_s3(context, msg: str, source="nmwdiose3"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="nmwdiose3"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_nmwdiose3():
- containers = nmwdiose3_getImage()
- harvest = nmwdiose3_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = nmwdiose3_missingreport_s3(start=harvest)
- report_idstat = nmwdiose3_identifier_stats(start=report_ms3)
- # for some reason, this causes a msg parameter missing
- report_bucketurl = nmwdiose3_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="nmwdiose3")
- load_release = nmwdiose3_naburelease(start=harvest)
- load_uploadrelease = nmwdiose3_uploadrelease(start=load_release)
-
- load_prune = nmwdiose3_nabu_prune(start=load_uploadrelease)
- load_prov = nmwdiose3_nabuprov(start=load_prune)
- load_org = nmwdiose3_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=nmwdiose3_missingreport_graph(start=load_org)
- report_graph=nmwdiose3_graph_reports(start=report_msgraph)
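-
-# Note (added sketch): the start=In(Nothing) inputs only impose ordering; no
-# data flows between these ops. The same idiom in miniature:
-# @op(ins={"start": In(Nothing)})
-# def second(context): ...
-#
-# @graph
-# def pair(): second(start=first())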
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nmwdiose4.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nmwdiose4.py
deleted file mode 100644
index 0e12cc23..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nmwdiose4.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, and not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
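-
-# Worked example (added; hypothetical values): with GLEANERIO_GRAPH_URL set to
-# https://graph.example.org/blazegraph and GLEANERIO_GRAPH_NAMESPACE=iow,
-# _graphEndpoint() returns https://graph.example.org/blazegraph/namespace/iow/sparql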
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if it does not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
- # revision of EC utilities; it will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
- ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
- # in docker-py, for a replicated-job service, total completions = replicas
- # with replicas=0 you do not get a container
- service_mode = ServiceMode("replicated-job",concurrency=1,replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
- mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise Exception(f"Container for service {name} not starting")
-
- get_dagster_logger().info(f"containers found: {len(containers)}")
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- service = None # ensure the finally block below can test it safely
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
- cid = container.id # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
- ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
- # Future: need to extract files, and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def nmwdiose4_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def nmwdiose4_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "nmwdiose4")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nmwdiose4_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "nmwdiose4")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nmwdiose4_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "nmwdiose4")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nmwdiose4_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "nmwdiose4")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nmwdiose4_naburelease(context):
- returned_value = gleanerio(context,("release"), "nmwdiose4")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def nmwdiose4_uploadrelease(context):
- returned_value = postRelease("nmwdiose4")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def nmwdiose4_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nmwdiose4")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nmwdiose4"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def nmwdiose4_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nmwdiose4")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nmwdiose4"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def nmwdiose4_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nmwdiose4")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nmwdiose4"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nmwdiose4_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nmwdiose4")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nmwdiose4"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nmwdiose4_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nmwdiose4"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
-# Can we simplify this to plain methods and then import those methods?
-# def missingreport_s3(context, msg: str, source="nmwdiose4"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="nmwdiose4"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_nmwdiose4():
- containers = nmwdiose4_getImage()
- harvest = nmwdiose4_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = nmwdiose4_missingreport_s3(start=harvest)
- report_idstat = nmwdiose4_identifier_stats(start=report_ms3)
- # for some reason, this causes a msg parameter missing
- report_bucketurl = nmwdiose4_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="nmwdiose4")
- load_release = nmwdiose4_naburelease(start=harvest)
- load_uploadrelease = nmwdiose4_uploadrelease(start=load_release)
-
- load_prune = nmwdiose4_nabu_prune(start=load_uploadrelease)
- load_prov = nmwdiose4_nabuprov(start=load_prune)
- load_org = nmwdiose4_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=nmwdiose4_missingreport_graph(start=load_org)
- report_graph=nmwdiose4_graph_reports(start=report_msgraph)
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nmwdist0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nmwdist0.py
deleted file mode 100644
index 63865abd..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nmwdist0.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, and not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if it does not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
- # revision of EC utilities; it will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
- ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
- # in docker-py, for a replicated-job service, total completions = replicas
- # with replicas=0 you do not get a container
- service_mode = ServiceMode("replicated-job",concurrency=1,replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
- mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise f"Container for service {name} not starting"
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
- cid = container.id # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
- ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
- # Future, need to extract files, and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def nmwdist0_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def nmwdist0_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "nmwdist0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nmwdist0_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "nmwdist0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nmwdist0_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "nmwdist0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nmwdist0_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "nmwdist0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nmwdist0_naburelease(context):
- returned_value = gleanerio(context,("release"), "nmwdist0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def nmwdist0_uploadrelease(context):
- returned_value = postRelease("nmwdist0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def nmwdist0_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nmwdist0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nmwdist0"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def nmwdist0_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nmwdist0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nmwdist0"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def nmwdist0_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nmwdist0")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nmwdist0"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nmwdist0_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nmwdist0")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nmwdist0"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nmwdist0_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nmwdist0"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
- # Can we simplify and use just a method, then import these methods?
-# def missingreport_s3(context, msg: str, source="nmwdist0"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="nmwdist0"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_nmwdist0():
- containers = nmwdist0_getImage()
- harvest = nmwdist0_gleaner(start=containers)
-
- # defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = nmwdist0_missingreport_s3(start=harvest)
- report_idstat = nmwdist0_identifier_stats(start=report_ms3)
- # for some reason, this causes a "msg parameter missing" error
- report_bucketurl = nmwdist0_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="nmwdist0")
- load_release = nmwdist0_naburelease(start=harvest)
- load_uploadrelease = nmwdist0_uploadrelease(start=load_release)
-
- load_prune = nmwdist0_nabu_prune(start=load_uploadrelease)
- load_prov = nmwdist0_nabuprov(start=load_prune)
- load_org = nmwdist0_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=nmwdist0_missingreport_graph(start=load_org)
- report_graph=nmwdist0_graph_reports(start=report_msgraph)
-
-
-
-
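Every generated module deleted in this patch repeats the same service-runner machinery: each harvest step runs Gleaner or Nabu as a one-shot Docker Swarm "replicated-job" service, polls until a task appears, streams its logs, and then removes the service. A minimal sketch of that pattern follows; the image, command, and service name are placeholder values, not anything from this repo, and it assumes a swarm-enabled daemon plus a docker-py version new enough for replicated-job mode (which the modules above already require).

```python
import time
import docker
from docker.types import RestartPolicy, ServiceMode

def run_once(image="alpine:3", command=("echo", "hello"), name="sch_example"):
    client = docker.from_env()
    # one-shot job: never restart, exactly one task
    service = client.services.create(
        image,
        args=list(command),
        name=name,
        restart_policy=RestartPolicy(condition="none"),
        mode=ServiceMode("replicated-job", concurrency=1, replicas=1),
    )
    try:
        # poll until the single task reaches a terminal state
        while True:
            states = [t["Status"]["State"] for t in service.tasks()]
            if any(s in ("complete", "failed", "rejected") for s in states):
                return states
            time.sleep(1)
    finally:
        service.remove()
```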
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw0.py
deleted file mode 100644
index 80663d14..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw0.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
- # volume and network need to be the names in docker, and not the names of the object in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
- # revision of EC utilities, will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
- ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
- # docker.py if replicated job, total completions = replicas
- # replicas =0 you do not get a container
- service_mode = ServiceMode("replicated-job",concurrency=1,replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
- mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise f"Container for service {name} not starting"
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
- cid = container.id # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
- ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
- # Future, need to extract files, and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def nwisgw0_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def nwisgw0_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "nwisgw0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw0_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "nwisgw0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw0_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "nwisgw0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw0_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "nwisgw0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw0_naburelease(context):
- returned_value = gleanerio(context,("release"), "nwisgw0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def nwisgw0_uploadrelease(context):
- returned_value = postRelease("nwisgw0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def nwisgw0_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw0"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def nwisgw0_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw0"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def nwisgw0_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw0")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw0"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw0_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw0")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw0"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw0_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw0"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
- # Can we simplify and use just a method, then import these methods?
-# def missingreport_s3(context, msg: str, source="nwisgw0"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="nwisgw0"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_nwisgw0():
- containers = nwisgw0_getImage()
- harvest = nwisgw0_gleaner(start=containers)
-
- # defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = nwisgw0_missingreport_s3(start=harvest)
- report_idstat = nwisgw0_identifier_stats(start=report_ms3)
- # for some reason, this causes a "msg parameter missing" error
- report_bucketurl = nwisgw0_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="nwisgw0")
- load_release = nwisgw0_naburelease(start=harvest)
- load_uploadrelease = nwisgw0_uploadrelease(start=load_release)
-
- load_prune = nwisgw0_nabu_prune(start=load_uploadrelease)
- load_prov = nwisgw0_nabuprov(start=load_prune)
- load_org = nwisgw0_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=nwisgw0_missingreport_graph(start=load_org)
- report_graph=nwisgw0_graph_reports(start=report_msgraph)
-
-
-
-
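The op wiring in these generated files uses Dagster's Nothing dependencies to order steps that pass no data: each op declares a `start` input of type `Nothing` purely for sequencing, and the `@graph` chains them via `start=`. A minimal sketch of that pattern with toy op names (not from this repo) is below.

```python
from dagster import In, Nothing, get_dagster_logger, graph, op

@op
def pull_images():
    get_dagster_logger().info("pull images")

@op(ins={"start": In(Nothing)})  # Nothing input orders the op, carries no value
def harvest():
    get_dagster_logger().info("harvest")

@op(ins={"start": In(Nothing)})
def report():
    get_dagster_logger().info("report")

@graph
def harvest_example():
    # chain: pull_images -> harvest -> report
    report(start=harvest(start=pull_images()))

harvest_example_job = harvest_example.to_job()
```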
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw1.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw1.py
deleted file mode 100644
index 7fd2ba8d..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw1.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
- # volume and network need to be the names in docker, and not the names of the object in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
- # revision of EC utilities, will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
- ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
- # docker.py if replicated job, total completions = replicas
- # replicas =0 you do not get a container
- service_mode = ServiceMode("replicated-job",concurrency=1,replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
-        mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
-            raise Exception(f"Container for service {name} not starting")
-
-    get_dagster_logger().info(str(len(containers)))
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
-        # add env variables here, e.g. "Env": ["FOO=bar","BAZ=quux"]
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
-        cid = container.id  # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
-        ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
-        s3loader(str(c).encode(), NAME)  # s3loader needs a bytes-like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
-        # Future: need to extract files and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
-        # looks like get_archive also has issues; returns nothing.
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def nwisgw1_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def nwisgw1_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "nwisgw1")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw1_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "nwisgw1")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw1_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "nwisgw1")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw1_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "nwisgw1")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw1_naburelease(context):
- returned_value = gleanerio(context,("release"), "nwisgw1")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def nwisgw1_uploadrelease(context):
- returned_value = postRelease("nwisgw1")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def nwisgw1_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw1")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw1"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-    r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def nwisgw1_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw1")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw1"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def nwisgw1_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw1")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw1"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw1_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw1")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw1"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw1_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw1"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
-# Can we simplify and use just a method, then import these methods?
-# def missingreport_s3(context, msg: str, source="nwisgw1"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="nwisgw1"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_nwisgw1():
- containers = nwisgw1_getImage()
- harvest = nwisgw1_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = nwisgw1_missingreport_s3(start=harvest)
- report_idstat = nwisgw1_identifier_stats(start=report_ms3)
-    # for some reason, this causes a "msg parameter missing" error
- report_bucketurl = nwisgw1_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="nwisgw1")
- load_release = nwisgw1_naburelease(start=harvest)
- load_uploadrelease = nwisgw1_uploadrelease(start=load_release)
-
- load_prune = nwisgw1_nabu_prune(start=load_uploadrelease)
- load_prov = nwisgw1_nabuprov(start=load_prune)
- load_org = nwisgw1_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=nwisgw1_missingreport_graph(start=load_org)
- report_graph=nwisgw1_graph_reports(start=report_msgraph)
-
-
-
-
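The `implnet_ops_nwisgw*.py` modules deleted in this changeset are copies of one another that differ only in the hard-coded source name. As a minimal sketch (not the scheduler's actual generator), the per-source ops and graph could instead be stamped out by a factory; `make_gleaner_op` and `make_harvest_graph` below are hypothetical names, and the sketch assumes the `gleanerio(context, mode, source)` helper defined in each of these modules:

```python
# Hypothetical sketch only: parameterize the duplicated per-source ops by
# source name instead of generating one module per source.
# make_gleaner_op / make_harvest_graph are illustrative names, not part of
# this repo; gleanerio(context, mode, source) is the helper defined above.
from dagster import In, Nothing, get_dagster_logger, graph, op


def make_gleaner_op(source: str, mode: str, needs_start: bool = True):
    # downstream ops take a Nothing "start" input purely for ordering
    ins = {"start": In(Nothing)} if needs_start else {}

    @op(name=f"{source}_{mode}", ins=ins)
    def _op(context):
        rc = gleanerio(context, mode, source)  # helper from the module above
        get_dagster_logger().info(f"{mode} for {source} returned {rc}")

    return _op


def make_harvest_graph(source: str):
    harvest = make_gleaner_op(source, "gleaner", needs_start=False)
    release = make_gleaner_op(source, "release")
    prune = make_gleaner_op(source, "prune")
    prov = make_gleaner_op(source, "prov")
    orgs = make_gleaner_op(source, "orgs")

    @graph(name=f"harvest_{source}")
    def _graph():
        # same Nothing-dependency chaining as the generated modules
        start = harvest()
        orgs(start=prov(start=prune(start=release(start=start))))

    return _graph
```

This only illustrates the shape of the duplication; wiring such factories into the code locations is untested here.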
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw10.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw10.py
deleted file mode 100644
index b1247e3e..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw10.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, and not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
-    # Make 'X' bucket if it does not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
-    # revision of EC utilities, will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
-    ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
- # docker.py if replicated job, total completions = replicas
- # replicas =0 you do not get a container
-    service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
-        mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
-            raise Exception(f"Container for service {name} not starting")
-
-    get_dagster_logger().info(str(len(containers)))
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
-        # add env variables here, e.g. "Env": ["FOO=bar","BAZ=quux"]
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
-        cid = container.id  # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
-        ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
-        s3loader(str(c).encode(), NAME)  # s3loader needs a bytes-like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
-        # Future: need to extract files and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
-        # looks like get_archive also has issues; returns nothing.
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def nwisgw10_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def nwisgw10_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "nwisgw10")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw10_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "nwisgw10")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw10_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "nwisgw10")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw10_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "nwisgw10")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw10_naburelease(context):
- returned_value = gleanerio(context,("release"), "nwisgw10")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def nwisgw10_uploadrelease(context):
- returned_value = postRelease("nwisgw10")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def nwisgw10_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw10")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw10"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-    r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def nwisgw10_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw10")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw10"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def nwisgw10_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw10")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw10"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw10_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw10")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw10"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw10_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw10"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
-# Can we simplify and use just a method, then import these methods?
-# def missingreport_s3(context, msg: str, source="nwisgw10"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="nwisgw10"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_nwisgw10():
- containers = nwisgw10_getImage()
- harvest = nwisgw10_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = nwisgw10_missingreport_s3(start=harvest)
- report_idstat = nwisgw10_identifier_stats(start=report_ms3)
-    # for some reason, this causes a "msg parameter missing" error
- report_bucketurl = nwisgw10_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="nwisgw10")
- load_release = nwisgw10_naburelease(start=harvest)
- load_uploadrelease = nwisgw10_uploadrelease(start=load_release)
-
- load_prune = nwisgw10_nabu_prune(start=load_uploadrelease)
- load_prov = nwisgw10_nabuprov(start=load_prune)
- load_org = nwisgw10_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=nwisgw10_missingreport_graph(start=load_org)
- report_graph=nwisgw10_graph_reports(start=report_msgraph)
-
-
-
-
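For orientation, the log-upload path in the `s3loader` function deleted above reduces to a single MinIO `put_object` call. Below is a minimal standalone sketch of that pattern; the endpoint, credentials, bucket, and object name are placeholders, not values from this deployment:

```python
# Minimal sketch of the s3loader upload path, with placeholder values.
import io
from datetime import datetime

from minio import Minio

client = Minio(
    "minio.example.org:9000",      # host:port form, used when not on 80/443
    secure=False,                  # pair secure=True with port 443
    access_key="EXAMPLE_ACCESS",   # placeholder credential
    secret_key="EXAMPLE_SECRET",   # placeholder credential
)

data = b"example container log output"
stamp = datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
obj = f"scheduler/logs/sch_example_gleaner_{stamp}.log"

client.put_object(
    "gleanerbucket",               # placeholder bucket name
    obj,
    io.BytesIO(data),              # put_object wants a stream plus its length
    len(data),
    content_type="text/plain",
)
```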
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw11.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw11.py
deleted file mode 100644
index 8cc75edb..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw11.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, and not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
-    # Make 'X' bucket if it does not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
-    # revision of EC utilities, will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
-    ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
-    # docker.py: for a replicated job, total completions = replicas
-    # with replicas=0 you do not get a container
-    service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
-    service = client.services.create(
-        image,
-        args=command,
-        env=env_vars,
-        name=name,
-        networks=container_context.networks if len(container_context.networks) else None,
-        restart_policy=restart_policy,
-        mode=service_mode,
-        workdir=workingdir,
-        configs=configs,
-    )
-    wait_count = 0
-    while True:
-        time.sleep(1)
-        wait_count += 1
- get_dagster_logger().debug(str(service.tasks()))
-
-        container_task = service.tasks(filters={"service": name})
-
-        containers = client.containers.list(all=True, filters={"label": f"com.docker.swarm.service.name={name}"})
-        if len(containers) > 0:
-            break
-        if wait_count > 12:
-            raise Exception(f"Container for service {name} not starting")
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
-        # add in env variables here: "Env": ["FOO=bar", "BAZ=quux"]
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
-        cid = container.id  # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
-        ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
-        # Future: need to extract files and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
-        if not DEBUG:
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def nwisgw11_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def nwisgw11_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "nwisgw11")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw11_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "nwisgw11")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw11_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "nwisgw11")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw11_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "nwisgw11")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw11_naburelease(context):
- returned_value = gleanerio(context,("release"), "nwisgw11")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def nwisgw11_uploadrelease(context):
- returned_value = postRelease("nwisgw11")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def nwisgw11_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw11")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw11"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-    r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def nwisgw11_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw11")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw11"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def nwisgw11_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw11")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw11"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw11_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw11")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw11"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
-    get_dagster_logger().info(f"identifier stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw11_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw11"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
-    get_dagster_logger().info(f"bucket urls report returned {r} ")
- return
-
-
-# Can we simplify this to just a method, and then import these methods?
-# def missingreport_s3(context, msg: str, source="nwisgw11"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="nwisgw11"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_nwisgw11():
- containers = nwisgw11_getImage()
- harvest = nwisgw11_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = nwisgw11_missingreport_s3(start=harvest)
- report_idstat = nwisgw11_identifier_stats(start=report_ms3)
-    # for some reason, this causes a missing msg parameter error
- report_bucketurl = nwisgw11_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="nwisgw11")
- load_release = nwisgw11_naburelease(start=harvest)
- load_uploadrelease = nwisgw11_uploadrelease(start=load_release)
-
- load_prune = nwisgw11_nabu_prune(start=load_uploadrelease)
- load_prov = nwisgw11_nabuprov(start=load_prune)
- load_org = nwisgw11_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=nwisgw11_missingreport_graph(start=load_org)
- report_graph=nwisgw11_graph_reports(start=report_msgraph)
-
-
-
-
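The generated modules deleted in this hunk and the next differ only in the hard-coded source name, which is what the "Can we simplify" comment above is asking. A minimal sketch of a source-parameterized factory that builds the same op chain, assuming `gleanerio` and `postRelease` move into a shared module (hypothetical name `ec_ops_common`); the report ops and the image-pull op are omitted for brevity:

```python
# Sketch only: assumes gleanerio and postRelease are importable from a
# shared module (hypothetical name: ec_ops_common).
from dagster import In, Nothing, get_dagster_logger, graph, op

from ec_ops_common import gleanerio, postRelease  # hypothetical


def build_harvest_graph(source: str):
    """Build the harvest graph for one source, replacing a generated module."""

    @op(name=f"{source}_gleaner")
    def run_gleaner(context):
        rc = gleanerio(context, "gleaner", source)
        get_dagster_logger().info(f"Gleaner returned {rc}")

    def nabu_op(mode: str):
        # Each nabu stage is the same container call with a different mode.
        @op(name=f"{source}_nabu_{mode}", ins={"start": In(Nothing)})
        def _op(context):
            rc = gleanerio(context, mode, source)
            get_dagster_logger().info(f"nabu {mode} returned {rc}")

        return _op

    @op(name=f"{source}_uploadrelease", ins={"start": In(Nothing)})
    def upload_release(context):
        postRelease(source)

    nabu_release = nabu_op("release")
    nabu_prune = nabu_op("prune")
    nabu_prov = nabu_op("prov")
    nabu_orgs = nabu_op("orgs")

    @graph(name=f"harvest_{source}")
    def harvest():
        # Same Nothing-dependency chain as the generated graphs.
        harvested = run_gleaner()
        uploaded = upload_release(start=nabu_release(start=harvested))
        nabu_orgs(start=nabu_prov(start=nabu_prune(start=uploaded)))

    return harvest
```

Usage would then be one line per source, e.g. `harvest_nwisgw11 = build_harvest_graph("nwisgw11")`, instead of a generated 728-line module per source.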
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw12.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw12.py
deleted file mode 100644
index c242c2f6..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw12.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
-    if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
-            and not secure):
-        server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
-    elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
-            and secure):
-        server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
-    else:
-        # it's not on a normal port
-        server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
-    # revision of EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
-    ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
-    # docker.py: for a replicated job, total completions = replicas
-    # with replicas=0 you do not get a container
-    service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
-    service = client.services.create(
-        image,
-        args=command,
-        env=env_vars,
-        name=name,
-        networks=container_context.networks if len(container_context.networks) else None,
-        restart_policy=restart_policy,
-        mode=service_mode,
-        workdir=workingdir,
-        configs=configs,
-    )
-    wait_count = 0
-    while True:
-        time.sleep(1)
-        wait_count += 1
- get_dagster_logger().debug(str(service.tasks()))
-
-        container_task = service.tasks(filters={"service": name})
-
-        containers = client.containers.list(all=True, filters={"label": f"com.docker.swarm.service.name={name}"})
-        if len(containers) > 0:
-            break
-        if wait_count > 12:
-            raise Exception(f"Container for service {name} not starting")
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
-        # add in env variables here: "Env": ["FOO=bar", "BAZ=quux"]
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
-        cid = container.id  # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
-        ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
-        # Future: need to extract files and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
-        if not DEBUG:
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def nwisgw12_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def nwisgw12_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "nwisgw12")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw12_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "nwisgw12")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw12_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "nwisgw12")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw12_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "nwisgw12")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw12_naburelease(context):
- returned_value = gleanerio(context,("release"), "nwisgw12")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def nwisgw12_uploadrelease(context):
- returned_value = postRelease("nwisgw12")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def nwisgw12_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw12")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw12"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-    r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def nwisgw12_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw12")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw12"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def nwisgw12_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw12")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw12"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw12_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw12")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw12"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
-    get_dagster_logger().info(f"identifier stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw12_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw12"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
-    get_dagster_logger().info(f"bucket urls report returned {r} ")
- return
-
-
-# Can we simplify this to just a method, and then import these methods?
-# def missingreport_s3(context, msg: str, source="nwisgw12"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="nwisgw12"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_nwisgw12():
- containers = nwisgw12_getImage()
- harvest = nwisgw12_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = nwisgw12_missingreport_s3(start=harvest)
- report_idstat = nwisgw12_identifier_stats(start=report_ms3)
-    # for some reason, this causes a missing msg parameter error
- report_bucketurl = nwisgw12_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="nwisgw12")
- load_release = nwisgw12_naburelease(start=harvest)
- load_uploadrelease = nwisgw12_uploadrelease(start=load_release)
-
- load_prune = nwisgw12_nabu_prune(start=load_uploadrelease)
- load_prov = nwisgw12_nabuprov(start=load_prune)
- load_org = nwisgw12_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=nwisgw12_missingreport_graph(start=load_org)
- report_graph=nwisgw12_graph_reports(start=report_msgraph)
-
-
-
-
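Every generated copy of `_create_service` polls for the service's container once a second and, in the original, gave up by raising a bare f-string (a TypeError in Python 3, corrected above to raise an Exception). A sketch of the same wait as a reusable helper with a monotonic deadline, assuming only the docker-py client API already used in these modules:

```python
import time

import docker


def wait_for_service_container(client: docker.DockerClient, name: str,
                               attempts: int = 12, interval: float = 1.0):
    """Poll until the swarm service's task container exists, else raise.

    A replicated-job service only gets a container once swarm schedules
    the task, so poll the label swarm attaches to service containers.
    """
    deadline = time.monotonic() + attempts * interval
    while time.monotonic() < deadline:
        containers = client.containers.list(
            all=True,
            filters={"label": f"com.docker.swarm.service.name={name}"},
        )
        if containers:
            return containers[0]
        time.sleep(interval)
    raise RuntimeError(f"Container for service {name} not starting")
```

Raising `RuntimeError` keeps a usable traceback and lets callers distinguish a scheduling timeout from Docker API errors.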
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw13.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw13.py
deleted file mode 100644
index 20c3418d..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw13.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
-    if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
-            and not secure):
-        server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
-    elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
-            and secure):
-        server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
-    else:
-        # it's not on a normal port
-        server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
-    # revision of EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
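-# Hedged sketch of the header trick used in _get_client: docker-py's APIClient
-# is a requests.Session, so a per-session header lets a proxy such as Portainer
-# authenticate raw Docker API calls (url and api_key here are illustrative):
-def _example_authed_client(url, api_key):
-    c = docker.DockerClient(base_url=url, version="1.43")
-    c.api.headers["X-API-Key"] = api_key  # sent with every Docker API request
-    return c
-
-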
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
-    ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
- # docker.py if replicated job, total completions = replicas
- # replicas =0 you do not get a container
-    service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
-        mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise f"Container for service {name} not starting"
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
-
-
-
-
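-# The container-wait loop in _create_service is a bounded poll; a generic,
-# hedged form of the same pattern (the names here are illustrative):
-def _example_wait_for(predicate, attempts=12, delay=1):
-    for _ in range(attempts):
-        result = predicate()
-        if result:
-            return result
-        time.sleep(delay)
-    raise Exception("condition not met in time")
-
-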
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
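-    # A table-driven alternative to the chain above, as a hedged sketch only
-    # (mode_table is hypothetical, not part of this module):
-    #   mode_table = {"prune": ["prune",  "--prefix", "summoned/" + source],
-    #                 "prov":  ["prefix", "--prefix", "prov/" + source]}
-    #   ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH] + mode_table[mode]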
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
-        cid = container.id  # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
-        ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
-        # Future: need to extract files, and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def nwisgw13_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def nwisgw13_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "nwisgw13")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw13_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "nwisgw13")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw13_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "nwisgw13")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw13_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "nwisgw13")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw13_naburelease(context):
- returned_value = gleanerio(context,("release"), "nwisgw13")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def nwisgw13_uploadrelease(context):
- returned_value = postRelease("nwisgw13")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def nwisgw13_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw13")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw13"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-    r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def nwisgw13_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw13")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw13"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def nwisgw13_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw13")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw13"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw13_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw13")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw13"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw13_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw13"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
-# Can we simplify and use just a method, then import these methods?
-# def missingreport_s3(context, msg: str, source="nwisgw13"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="nwisgw13"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_nwisgw13():
- containers = nwisgw13_getImage()
- harvest = nwisgw13_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = nwisgw13_missingreport_s3(start=harvest)
- report_idstat = nwisgw13_identifier_stats(start=report_ms3)
- # for some reason, this causes a msg parameter missing
- report_bucketurl = nwisgw13_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="nwisgw13")
- load_release = nwisgw13_naburelease(start=harvest)
- load_uploadrelease = nwisgw13_uploadrelease(start=load_release)
-
- load_prune = nwisgw13_nabu_prune(start=load_uploadrelease)
- load_prov = nwisgw13_nabuprov(start=load_prune)
- load_org = nwisgw13_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=nwisgw13_missingreport_graph(start=load_org)
- report_graph=nwisgw13_graph_reports(start=report_msgraph)
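-
-# One way to materialize this graph as a runnable job, as a hedged sketch;
-# the job name is an assumption and is not defined in this file:
-# harvest_nwisgw13_job = harvest_nwisgw13.to_job(name="harvest_nwisgw13_job")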
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw14.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw14.py
deleted file mode 100644
index ccb6e998..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw14.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, and not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
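-# Worked example of the endpoint construction (env values are assumptions):
-#   GLEANERIO_GRAPH_URL=http://graph:9999/blazegraph, GLEANERIO_GRAPH_NAMESPACE=earthcube
-#   -> http://graph:9999/blazegraph/namespace/earthcube/sparql
-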
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
-    # read the whole file; a context manager ensures the handle is closed
-    with open(image_path, 'rb') as f:
-        return f.read()
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
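-# load_data falls back from URL to local file because urlopen raises
-# ValueError for a bare filesystem path. Hedged usage (both values illustrative):
-#   cfg = load_data("https://example.org/gleanerconfig.yaml")
-#   cfg = load_data("/gleaner/gleanerconfig.yaml")
-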
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
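-# Hedged usage sketch of s3reader; note it returns None after an S3Error, and
-# the object key below is an illustrative assumption:
-def _example_read_object():
-    resp = s3reader("scheduler/configs/GleanerCfg.tgz")
-    if resp is None:
-        return None
-    try:
-        return resp.read()
-    finally:
-        resp.close()
-        resp.release_conn()
-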
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
-    # a revision of EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
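-# What postRelease does on the wire, as a standalone hedged sketch (the
-# endpoint and file URL are assumptions for illustration):
-def _example_post_release():
-    endpoint = "http://graph:9999/blazegraph/namespace/earthcube/sparql"
-    nq_url = "http://minio:9000/gleaner/graphs/latest/demo_release.nq"
-    r = requests.post(f"{endpoint}?uri={nq_url}")
-    return r.status_code == 200 and 'data modified="0"' not in r.text
-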
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
-    ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
- # docker.py if replicated job, total completions = replicas
- # replicas =0 you do not get a container
-    service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
-        mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise f"Container for service {name} not starting"
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
-        cid = container.id  # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
-        ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
-        # Future: need to extract files, and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def nwisgw14_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def nwisgw14_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "nwisgw14")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw14_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "nwisgw14")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw14_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "nwisgw14")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw14_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "nwisgw14")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw14_naburelease(context):
- returned_value = gleanerio(context,("release"), "nwisgw14")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def nwisgw14_uploadrelease(context):
- returned_value = postRelease("nwisgw14")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def nwisgw14_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw14")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw14"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-    r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def nwisgw14_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw14")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw14"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def nwisgw14_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw14")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw14"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw14_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw14")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw14"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw14_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw14"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
-# Can we simplify and use just a method, then import these methods?
-# def missingreport_s3(context, msg: str, source="nwisgw14"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="nwisgw14"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_nwisgw14():
- containers = nwisgw14_getImage()
- harvest = nwisgw14_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = nwisgw14_missingreport_s3(start=harvest)
- report_idstat = nwisgw14_identifier_stats(start=report_ms3)
- # for some reason, this causes a msg parameter missing
- report_bucketurl = nwisgw14_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="nwisgw14")
- load_release = nwisgw14_naburelease(start=harvest)
- load_uploadrelease = nwisgw14_uploadrelease(start=load_release)
-
- load_prune = nwisgw14_nabu_prune(start=load_uploadrelease)
- load_prov = nwisgw14_nabuprov(start=load_prune)
- load_org = nwisgw14_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=nwisgw14_missingreport_graph(start=load_org)
- report_graph=nwisgw14_graph_reports(start=report_msgraph)
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw15.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw15.py
deleted file mode 100644
index 65e7e3ca..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw15.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, and not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
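-# _pythonMinioUrl only special-cases AWS-hosted addresses; a worked example
-# (the hostnames are illustrative):
-#   _pythonMinioUrl("mybucket.s3.amazonaws.com") -> "s3.amazonaws.com"
-#   _pythonMinioUrl("minio")                     -> "minio"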
-def read_file_bytestream(image_path):
-    # read the whole file; a context manager ensures the handle is closed
-    with open(image_path, 'rb') as f:
-        return f.read()
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
-    # a revision of EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
- ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
- # docker.py: for a replicated job, total completions = replicas
- # with replicas=0 you do not get a container
- service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env=env_vars,
- name=name,
- networks=container_context.networks if len(container_context.networks) else None,
- restart_policy=restart_policy,
- mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count = 0
- while True:
- time.sleep(1)
- wait_count += 1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service": name})
-
- containers = client.containers.list(all=True, filters={"label": f"com.docker.swarm.service.name={name}"})
- if len(containers) > 0:
- break
- if wait_count > 12:
- raise Exception(f"Container for service {name} not starting")
-
- get_dagster_logger().info(f"{len(containers)} containers found for service {name}")
- return service, containers[0]
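The polling loop above waits roughly `12 * 1s` for the swarm job to produce a container. A sketch of the same pattern as a reusable helper (names and defaults are illustrative, not part of the generated code):

```python
import time

def _wait_for_service_container(client, name, attempts=12, interval=1):
    """Poll until the swarm service has a container, or give up."""
    for _ in range(attempts):
        time.sleep(interval)
        containers = client.containers.list(
            all=True,
            filters={"label": f"com.docker.swarm.service.name={name}"},
        )
        if containers:
            return containers[0]
    raise Exception(f"Container for service {name} not starting")
```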
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
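The if/elif dispatch reduces to a small lookup table; a sketch (not part of the generated code), where `{source}` marks where the source name is appended:

```python
# nabu modes only; "gleaner" mode uses GLEANERIO_GLEANER_IMAGE instead.
NABU_MODE_ARGS = {
    "prune":   ["prune",   "--prefix", "summoned/{source}"],
    "prov":    ["prefix",  "--prefix", "prov/{source}"],
    "orgs":    ["prefix",  "--prefix", "orgs"],  # no source suffix
    "release": ["release", "--prefix", "summoned/{source}"],
}
```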
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- service = None # ensure the finally block below can test it
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
- cid = container.id # legacy until the service start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
- ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
- # Future: need to extract files, and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def nwisgw15_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def nwisgw15_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "nwisgw15")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw15_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "nwisgw15")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw15_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "nwisgw15")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw15_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "nwisgw15")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw15_naburelease(context):
- returned_value = gleanerio(context,("release"), "nwisgw15")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def nwisgw15_uploadrelease(context):
- returned_value = postRelease("nwisgw15")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def nwisgw15_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw15")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw15"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def nwisgw15_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw15")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw15"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def nwisgw15_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw15")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw15"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw15_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw15")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw15"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw15_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw15"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
-#Can we simplify and use just a method. Then import these methods?
-# def missingreport_s3(context, msg: str, source="nwisgw15"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="nwisgw15"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_nwisgw15():
- containers = nwisgw15_getImage()
- harvest = nwisgw15_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = nwisgw15_missingreport_s3(start=harvest)
- report_idstat = nwisgw15_identifier_stats(start=report_ms3)
- # for some reason, this causes a "msg parameter missing" error
- report_bucketurl = nwisgw15_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="nwisgw15")
- load_release = nwisgw15_naburelease(start=harvest)
- load_uploadrelease = nwisgw15_uploadrelease(start=load_release)
-
- load_prune = nwisgw15_nabu_prune(start=load_uploadrelease)
- load_prov = nwisgw15_nabuprov(start=load_prune)
- load_org = nwisgw15_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=nwisgw15_missingreport_graph(start=load_org)
- report_graph=nwisgw15_graph_reports(start=report_msgraph)
-
-
-
-
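Graphs like `harvest_nwisgw15` are presumably wrapped into jobs and schedules elsewhere in the generated tree; a minimal sketch of that wiring (job name and cron string are hypothetical):

```python
from dagster import ScheduleDefinition

harvest_nwisgw15_job = harvest_nwisgw15.to_job(name="harvest_nwisgw15_job")
harvest_nwisgw15_schedule = ScheduleDefinition(
    job=harvest_nwisgw15_job,
    cron_schedule="0 6 * * 0",  # hypothetical: weekly, Sunday 06:00
)
```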
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw16.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw16.py
deleted file mode 100644
index 6b794bbd..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw16.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG = (os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME = os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK = os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET = str(os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH = str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH = str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE = str(os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG = str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG = str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
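`_pythonMinioUrl` collapses any AWS endpoint to the generic S3 hostname and passes everything else through:

```python
assert _pythonMinioUrl("s3.us-west-2.amazonaws.com") == "s3.amazonaws.com"
assert _pythonMinioUrl("oss.example.org") == "oss.example.org"  # hypothetical host
```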
-def read_file_bytestream(image_path):
- with open(image_path, 'rb') as f:
- data = f.read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
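`load_data` tries the argument as a URL first and falls back to a local file when `urlopen` raises `ValueError` (e.g. for a bare path with no scheme); both arguments below are hypothetical:

```python
cfg = load_data("https://example.org/gleanerconfig.yaml")  # fetched over HTTP
cfg = load_data("/gleaner/gleanerconfig.yaml")             # read from disk
```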
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
- # revision of EC utilities, will have a insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
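`_get_client` talks to the Docker Engine API through Portainer, so every request has to carry an `X-API-Key` header; docker-py appears to have no public hook for extra headers, hence the reach into `client.api._general_configs`. A condensed sketch of the same idea (the Portainer URL is hypothetical):

```python
import docker

client = docker.DockerClient(
    base_url="https://portainer.example.org:9443/api/endpoints/1/docker",
    version="1.43",
)
client.api.headers["X-API-Key"] = APIKEY  # relies on docker-py internals
```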
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
- ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
- # docker.py: for a replicated job, total completions = replicas
- # with replicas=0 you do not get a container
- service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env=env_vars,
- name=name,
- networks=container_context.networks if len(container_context.networks) else None,
- restart_policy=restart_policy,
- mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count = 0
- while True:
- time.sleep(1)
- wait_count += 1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service": name})
-
- containers = client.containers.list(all=True, filters={"label": f"com.docker.swarm.service.name={name}"})
- if len(containers) > 0:
- break
- if wait_count > 12:
- raise Exception(f"Container for service {name} not starting")
-
- get_dagster_logger().info(f"{len(containers)} containers found for service {name}")
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- service = None # ensure the finally block below can test it
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
- cid = container.id # legacy until the service start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
- ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
- # Future: need to extract files, and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def nwisgw16_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def nwisgw16_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "nwisgw16")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw16_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "nwisgw16")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw16_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "nwisgw16")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw16_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "nwisgw16")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw16_naburelease(context):
- returned_value = gleanerio(context,("release"), "nwisgw16")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def nwisgw16_uploadrelease(context):
- returned_value = postRelease("nwisgw16")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def nwisgw16_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw16")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw16"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def nwisgw16_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw16")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw16"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def nwisgw16_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw16")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw16"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw16_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw16")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw16"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw16_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw16"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
-#Can we simplify and use just a method. Then import these methods?
-# def missingreport_s3(context, msg: str, source="nwisgw16"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="nwisgw16"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_nwisgw16():
- containers = nwisgw16_getImage()
- harvest = nwisgw16_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = nwisgw16_missingreport_s3(start=harvest)
- report_idstat = nwisgw16_identifier_stats(start=report_ms3)
- # for some reason, this causes a "msg parameter missing" error
- report_bucketurl = nwisgw16_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="nwisgw16")
- load_release = nwisgw16_naburelease(start=harvest)
- load_uploadrelease = nwisgw16_uploadrelease(start=load_release)
-
- load_prune = nwisgw16_nabu_prune(start=load_uploadrelease)
- load_prov = nwisgw16_nabuprov(start=load_prune)
- load_org = nwisgw16_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=nwisgw16_missingreport_graph(start=load_org)
- report_graph=nwisgw16_graph_reports(start=report_msgraph)
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw17.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw17.py
deleted file mode 100644
index 059c0d0b..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw17.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG = (os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME = os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK = os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET = str(os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH = str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH = str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE = str(os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG = str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG = str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- with open(image_path, 'rb') as f:
- data = f.read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
- # revision of EC utilities, will have a insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
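-# Launch the image as a one-shot swarm service (replicated-job, one replica, no restart) with the
-# gleaner and nabu Docker configs attached, then poll for up to ~12s until its container appears.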
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
- ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
- # docker.py: for a replicated job, total completions = replicas
- # with replicas=0 you do not get a container
- service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
- mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise f"Container for service {name} not starting"
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
-
-
-
-
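-# Run one gleaner/nabu step for a source: pick the image and args by mode, start it as a swarm
-# service, stream and archive its logs to S3, and surface a non-zero exit code as an exception.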
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
- cid = container.id # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
- ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
- # Future: need to extract the files and upload them
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def nwisgw17_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def nwisgw17_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "nwisgw17")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw17_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "nwisgw17")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw17_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "nwisgw17")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw17_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "nwisgw17")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw17_naburelease(context):
- returned_value = gleanerio(context,("release"), "nwisgw17")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def nwisgw17_uploadrelease(context):
- returned_value = postRelease("nwisgw17")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def nwisgw17_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw17")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw17"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def nwisgw17_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw17")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw17"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def nwisgw17_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw17")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw17"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw17_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw17")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw17"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw17_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw17"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
-#Can we simplify and use just a method. Then import these methods?
-# def missingreport_s3(context, msg: str, source="nwisgw17"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="nwisgw17"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
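-# Harvest pipeline for this source: pull images, run the gleaner harvest, then two Nothing-dependency
-# chains: s3/identifier/bucket reports, and release -> upload -> prune -> prov -> orgs -> graph reports.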
-@graph
-def harvest_nwisgw17():
- containers = nwisgw17_getImage()
- harvest = nwisgw17_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = nwisgw17_missingreport_s3(start=harvest)
- report_idstat = nwisgw17_identifier_stats(start=report_ms3)
- # for some reason, this caused a "msg parameter missing" error
- report_bucketurl = nwisgw17_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="nwisgw17")
- load_release = nwisgw17_naburelease(start=harvest)
- load_uploadrelease = nwisgw17_uploadrelease(start=load_release)
-
- load_prune = nwisgw17_nabu_prune(start=load_uploadrelease)
- load_prov = nwisgw17_nabuprov(start=load_prune)
- load_org = nwisgw17_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=nwisgw17_missingreport_graph(start=load_org)
- report_graph=nwisgw17_graph_reports(start=report_msgraph)
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw18.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw18.py
deleted file mode 100644
index f45c6048..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw18.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, and not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
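-# The GLEANERIO_* variables configure the MinIO store, the graph endpoint, the container images, and
-# the in-container config paths; the MinIO and graph settings have no defaults and must be set.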
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
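-# Normalize the MinIO address for the python client: any *.amazonaws.com host collapses to
-# s3.amazonaws.com; everything else passes through unchanged.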
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make the 'X' bucket if it does not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
- # revision of EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
- ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
- # docker.py: for a replicated job, total completions = replicas
- # with replicas=0 you do not get a container
- service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
- mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise f"Container for service {name} not starting"
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
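- # Pass the MinIO/SPARQL/headless settings to the job through its environment; NetworkMode
- # attaches it to the shared network, presumably so gleaner can reach the headless browser.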
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
- cid = container.id # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
- ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
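- # Fetch {WorkingDir}/logs from the finished container via the Docker Engine archive endpoint
- # (authenticated with the Portainer X-API-Key) and stash the returned tar in S3.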
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
- # Future: need to extract the files and upload them
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def nwisgw18_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def nwisgw18_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "nwisgw18")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw18_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "nwisgw18")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw18_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "nwisgw18")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw18_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "nwisgw18")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw18_naburelease(context):
- returned_value = gleanerio(context,("release"), "nwisgw18")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def nwisgw18_uploadrelease(context):
- returned_value = postRelease("nwisgw18")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def nwisgw18_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw18")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw18"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def nwisgw18_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw18")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw18"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def nwisgw18_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw18")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw18"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw18_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw18")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw18"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw18_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw18"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
-#Can we simplify and use just a method. Then import these methods?
-# def missingreport_s3(context, msg: str, source="nwisgw18"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="nwisgw18"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_nwisgw18():
- containers = nwisgw18_getImage()
- harvest = nwisgw18_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = nwisgw18_missingreport_s3(start=harvest)
- report_idstat = nwisgw18_identifier_stats(start=report_ms3)
- # for some reason, this caused a "msg parameter missing" error
- report_bucketurl = nwisgw18_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="nwisgw18")
- load_release = nwisgw18_naburelease(start=harvest)
- load_uploadrelease = nwisgw18_uploadrelease(start=load_release)
-
- load_prune = nwisgw18_nabu_prune(start=load_uploadrelease)
- load_prov = nwisgw18_nabuprov(start=load_prune)
- load_org = nwisgw18_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=nwisgw18_missingreport_graph(start=load_org)
- report_graph=nwisgw18_graph_reports(start=report_msgraph)
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw19.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw19.py
deleted file mode 100644
index 5c2c5e25..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw19.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
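-# With DEBUG=true the finished swarm service is left in place for inspection instead of removed.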
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, and not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
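-# Fetch bytes from a URL; when urlopen rejects the argument as a URL (ValueError), fall back to
-# reading it as a local file path.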
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make the 'X' bucket if it does not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
- # revision of EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
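-    # The API key is set both in docker-py's general configs and on the live
-    # session headers so every request to the Portainer-proxied Docker API
-    # carries X-API-Key, whichever request path docker-py takes.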
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
-    ## thoughts
-    # return service, container, since there is one
-    restart_policy = RestartPolicy(condition='none')
-    # docker.py: for a replicated job, total completions = replicas;
-    # with replicas=0 you do not get a container
-    service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
-    service = client.services.create(
-        image,
-        args=command,
-        env=env_vars,
-        name=name,
-        networks=container_context.networks if len(container_context.networks) else None,
-        restart_policy=restart_policy,
-        mode=service_mode,
-        workdir=workingdir,
-        configs=configs,
-    )
-    # Poll until swarm has scheduled a container for the service (or give up).
-    wait_count = 0
-    while True:
-        time.sleep(1)
-        wait_count += 1
-        get_dagster_logger().debug(str(service.tasks()))
-
-        container_task = service.tasks(filters={"service": name})
-
-        containers = client.containers.list(all=True, filters={"label": f"com.docker.swarm.service.name={name}"})
-        if len(containers) > 0:
-            break
-        if wait_count > 12:
-            raise Exception(f"Container for service {name} not starting")
-
-    get_dagster_logger().info(str(len(containers)))
-    return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
-        enva = []
-        enva.append(f"MINIO_ADDRESS={GLEANER_MINIO_ADDRESS}")
-        enva.append(f"MINIO_PORT={GLEANER_MINIO_PORT}")
-        enva.append(f"MINIO_USE_SSL={GLEANER_MINIO_USE_SSL}")
-        enva.append(f"MINIO_SECRET_KEY={GLEANER_MINIO_SECRET_KEY}")
-        enva.append(f"MINIO_ACCESS_KEY={GLEANER_MINIO_ACCESS_KEY}")
-        enva.append(f"MINIO_BUCKET={GLEANER_MINIO_BUCKET}")
-        enva.append(f"SPARQL_ENDPOINT={_graphEndpoint()}")
-        enva.append(f"GLEANER_HEADLESS_ENDPOINT={GLEANER_HEADLESS_ENDPOINT}")
-        enva.append(f"GLEANER_HEADLESS_NETWORK={GLEANER_HEADLESS_NETWORK}")
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
-        try:
-            get_dagster_logger().info("try docker _create_service: ")
-            service, container = _create_service(
-                context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
-                workingdir=data["WorkingDir"]
-            )
-        except Exception:
-            raise  # let the failure propagate with its original traceback
-
-
-        cid = container.id  # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
-        ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
-        # Future: need to extract files and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def nwisgw19_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def nwisgw19_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "nwisgw19")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw19_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "nwisgw19")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw19_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "nwisgw19")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw19_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "nwisgw19")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw19_naburelease(context):
- returned_value = gleanerio(context,("release"), "nwisgw19")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def nwisgw19_uploadrelease(context):
- returned_value = postRelease("nwisgw19")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def nwisgw19_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw19")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw19"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-    r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def nwisgw19_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw19")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw19"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def nwisgw19_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw19")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw19"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw19_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw19")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw19"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw19_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw19"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
-# Can we simplify and use just a method, then import these methods?
-# def missingreport_s3(context, msg: str, source="nwisgw19"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="nwisgw19"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_nwisgw19():
- containers = nwisgw19_getImage()
- harvest = nwisgw19_gleaner(start=containers)
-
-    # defining Nothing dependencies
-    # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = nwisgw19_missingreport_s3(start=harvest)
- report_idstat = nwisgw19_identifier_stats(start=report_ms3)
-    # for some reason, this causes a "missing msg parameter" error
- report_bucketurl = nwisgw19_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="nwisgw19")
- load_release = nwisgw19_naburelease(start=harvest)
- load_uploadrelease = nwisgw19_uploadrelease(start=load_release)
-
- load_prune = nwisgw19_nabu_prune(start=load_uploadrelease)
- load_prov = nwisgw19_nabuprov(start=load_prune)
- load_org = nwisgw19_nabuorg(start=load_prov)
-
-    # run after load
-    report_msgraph = nwisgw19_missingreport_graph(start=load_org)
-    report_graph = nwisgw19_graph_reports(start=report_msgraph)
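-
-# A minimal sketch (not part of this generated file; the job name and cron are
-# hypothetical) of how a graph like this is typically wired into a runnable
-# Dagster job and schedule elsewhere in the scheduler:
-#
-#   from dagster import ScheduleDefinition
-#
-#   harvest_nwisgw19_job = harvest_nwisgw19.to_job()
-#   harvest_nwisgw19_schedule = ScheduleDefinition(
-#       job=harvest_nwisgw19_job,
-#       cron_schedule="0 6 * * *",
-#   )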
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw2.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw2.py
deleted file mode 100644
index 0bb8f1d0..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw2.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils
-import time
-
-from dagster import job, op, graph, In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG = (os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME = os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK = os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET = str(os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH = str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH = str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE = str(os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/'))  # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG = str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG = str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-    # normalize any *.amazonaws.com address to the generic s3.amazonaws.com
-    # endpoint expected by the MinIO client; other addresses pass through as-is
-    if url.endswith(".amazonaws.com"):
-        PYTHON_MINIO_URL = "s3.amazonaws.com"
-    else:
-        PYTHON_MINIO_URL = url
-    return PYTHON_MINIO_URL
-
-
-def read_file_bytestream(image_path):
-    # use a context manager so the file handle is closed promptly
-    with open(image_path, 'rb') as f:
-        data = f.read()
-    return data
-
-
-def load_data(file_or_url):
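-    # urlopen() raises ValueError for strings that are not URLs, so the except
-    # below falls back to treating the argument as a local file path.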
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
-    secure = GLEANER_MINIO_USE_SSL
-    # The MinIO client expects host[:port]; omit the port when it is the
-    # protocol default (80 for http, 443 for https).
-    if GLEANER_MINIO_PORT == "80" and not secure:
-        server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
-    elif GLEANER_MINIO_PORT == "443" and secure:
-        server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
-    else:
-        # it's not on a standard port
-        server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
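-    # Minio.put_object() requires a readable stream plus an explicit length,
-    # hence the BytesIO wrapper, the captured write length, and the seek(0).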
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
-    # revision of EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
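-    # POSTing to the namespace SPARQL endpoint with ?uri=<release_url> asks the
-    # graph store (Blazegraph here) to fetch and load the n-quads file itself,
-    # so the release never passes through this process.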
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
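-    # The API key is set both in docker-py's general configs and on the live
-    # session headers so every request to the Portainer-proxied Docker API
-    # carries X-API-Key, whichever request path docker-py takes.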
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
-    ## thoughts
-    # return service, container, since there is one
-    restart_policy = RestartPolicy(condition='none')
-    # docker.py: for a replicated job, total completions = replicas;
-    # with replicas=0 you do not get a container
-    service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
-    service = client.services.create(
-        image,
-        args=command,
-        env=env_vars,
-        name=name,
-        networks=container_context.networks if len(container_context.networks) else None,
-        restart_policy=restart_policy,
-        mode=service_mode,
-        workdir=workingdir,
-        configs=configs,
-    )
-    # Poll until swarm has scheduled a container for the service (or give up).
-    wait_count = 0
-    while True:
-        time.sleep(1)
-        wait_count += 1
-        get_dagster_logger().debug(str(service.tasks()))
-
-        container_task = service.tasks(filters={"service": name})
-
-        containers = client.containers.list(all=True, filters={"label": f"com.docker.swarm.service.name={name}"})
-        if len(containers) > 0:
-            break
-        if wait_count > 12:
-            raise Exception(f"Container for service {name} not starting")
-
-    get_dagster_logger().info(str(len(containers)))
-    return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
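-    # Mode summary: "gleaner" harvests the source; the nabu modes post-process it:
-    # "release" builds graphs/latest/<source>_release.nq, "prune" reconciles the
-    # graph against summoned/<source>, and "prov"/"orgs" load those prefixes.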
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
-        enva = []
-        enva.append(f"MINIO_ADDRESS={GLEANER_MINIO_ADDRESS}")
-        enva.append(f"MINIO_PORT={GLEANER_MINIO_PORT}")
-        enva.append(f"MINIO_USE_SSL={GLEANER_MINIO_USE_SSL}")
-        enva.append(f"MINIO_SECRET_KEY={GLEANER_MINIO_SECRET_KEY}")
-        enva.append(f"MINIO_ACCESS_KEY={GLEANER_MINIO_ACCESS_KEY}")
-        enva.append(f"MINIO_BUCKET={GLEANER_MINIO_BUCKET}")
-        enva.append(f"SPARQL_ENDPOINT={_graphEndpoint()}")
-        enva.append(f"GLEANER_HEADLESS_ENDPOINT={GLEANER_HEADLESS_ENDPOINT}")
-        enva.append(f"GLEANER_HEADLESS_NETWORK={GLEANER_HEADLESS_NETWORK}")
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
-        try:
-            get_dagster_logger().info("try docker _create_service: ")
-            service, container = _create_service(
-                context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
-                workingdir=data["WorkingDir"]
-            )
-        except Exception:
-            raise  # let the failure propagate with its original traceback
-
-
-        cid = container.id  # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
-        ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
-        # Future: need to extract files and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def nwisgw2_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def nwisgw2_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "nwisgw2")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw2_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "nwisgw2")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw2_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "nwisgw2")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw2_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "nwisgw2")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw2_naburelease(context):
- returned_value = gleanerio(context,("release"), "nwisgw2")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def nwisgw2_uploadrelease(context):
- returned_value = postRelease("nwisgw2")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def nwisgw2_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw2")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw2"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-    r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def nwisgw2_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw2")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw2"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def nwisgw2_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw2")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw2"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw2_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw2")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw2"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw2_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw2"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
-# Can we simplify and use just a method, then import these methods?
-# def missingreport_s3(context, msg: str, source="nwisgw2"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="nwisgw2"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_nwisgw2():
- containers = nwisgw2_getImage()
- harvest = nwisgw2_gleaner(start=containers)
-
-    # defining Nothing dependencies
-    # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = nwisgw2_missingreport_s3(start=harvest)
- report_idstat = nwisgw2_identifier_stats(start=report_ms3)
-    # for some reason, this causes a "missing msg parameter" error
- report_bucketurl = nwisgw2_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="nwisgw2")
- load_release = nwisgw2_naburelease(start=harvest)
- load_uploadrelease = nwisgw2_uploadrelease(start=load_release)
-
- load_prune = nwisgw2_nabu_prune(start=load_uploadrelease)
- load_prov = nwisgw2_nabuprov(start=load_prune)
- load_org = nwisgw2_nabuorg(start=load_prov)
-
-    # run after load
-    report_msgraph = nwisgw2_missingreport_graph(start=load_org)
-    report_graph = nwisgw2_graph_reports(start=report_msgraph)
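-
-# A minimal sketch (not part of this generated file; the job name and cron are
-# hypothetical) of how a graph like this is typically wired into a runnable
-# Dagster job and schedule elsewhere in the scheduler:
-#
-#   from dagster import ScheduleDefinition
-#
-#   harvest_nwisgw2_job = harvest_nwisgw2.to_job()
-#   harvest_nwisgw2_schedule = ScheduleDefinition(
-#       job=harvest_nwisgw2_job,
-#       cron_schedule="0 6 * * *",
-#   )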
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw20.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw20.py
deleted file mode 100644
index e63561fe..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw20.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils
-import time
-
-from dagster import job, op, graph, In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG = (os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME = os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK = os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET = str(os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH = str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH = str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE = str(os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/'))  # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG = str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG = str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-    # normalize any *.amazonaws.com address to the generic s3.amazonaws.com
-    # endpoint expected by the MinIO client; other addresses pass through as-is
-    if url.endswith(".amazonaws.com"):
-        PYTHON_MINIO_URL = "s3.amazonaws.com"
-    else:
-        PYTHON_MINIO_URL = url
-    return PYTHON_MINIO_URL
-
-
-def read_file_bytestream(image_path):
-    # use a context manager so the file handle is closed promptly
-    with open(image_path, 'rb') as f:
-        data = f.read()
-    return data
-
-
-def load_data(file_or_url):
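-    # urlopen() raises ValueError for strings that are not URLs, so the except
-    # below falls back to treating the argument as a local file path.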
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
-    secure = GLEANER_MINIO_USE_SSL
-    # The MinIO client expects host[:port]; omit the port when it is the
-    # protocol default (80 for http, 443 for https).
-    if GLEANER_MINIO_PORT == "80" and not secure:
-        server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
-    elif GLEANER_MINIO_PORT == "443" and secure:
-        server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
-    else:
-        # it's not on a standard port
-        server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
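-    # Minio.put_object() requires a readable stream plus an explicit length,
-    # hence the BytesIO wrapper, the captured write length, and the seek(0).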
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
-    # revision of EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
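-    # POSTing to the namespace SPARQL endpoint with ?uri=<release_url> asks the
-    # graph store (Blazegraph here) to fetch and load the n-quads file itself,
-    # so the release never passes through this process.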
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
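-# Hedged usage note: with PORTAINER_URL and PORTAINER_KEY set (values hypothetical),
-# _get_client returns a docker.DockerClient whose requests carry the X-API-Key header
-# a Portainer-proxied Docker API expects, and logs in to a registry when one is configured.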
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
- ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
- # docker.py: for a replicated job, total completions = replicas
- # replicas=0 means you do not get a container
- service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
- mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise Exception(f"Container for service {name} not starting")
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
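-# Design note: services.create() for a replicated-job returns before the task container
-# exists, so the loop above polls (up to ~12 seconds) for a container carrying the
-# com.docker.swarm.service.name label rather than trusting service.tasks() alone.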
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here, e.g. "Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
- cid = container.id # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
- ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
- # Future: need to extract files, and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def nwisgw20_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def nwisgw20_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "nwisgw20")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw20_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "nwisgw20")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw20_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "nwisgw20")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw20_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "nwisgw20")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw20_naburelease(context):
- returned_value = gleanerio(context,("release"), "nwisgw20")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def nwisgw20_uploadrelease(context):
- returned_value = postRelease("nwisgw20")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def nwisgw20_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw20")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw20"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def nwisgw20_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw20")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw20"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def nwisgw20_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw20")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw20"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw20_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw20")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw20"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifier stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw20_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw20"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucket urls report returned {r} ")
- return
-
-
-#Can we simplify and use just a method. Then import these methods?
-# def missingreport_s3(context, msg: str, source="nwisgw20"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="nwisgw20"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_nwisgw20():
- containers = nwisgw20_getImage()
- harvest = nwisgw20_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = nwisgw20_missingreport_s3(start=harvest)
- report_idstat = nwisgw20_identifier_stats(start=report_ms3)
- # for some reason, this causes a "msg parameter missing" error
- report_bucketurl = nwisgw20_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="nwisgw20")
- load_release = nwisgw20_naburelease(start=harvest)
- load_uploadrelease = nwisgw20_uploadrelease(start=load_release)
-
- load_prune = nwisgw20_nabu_prune(start=load_uploadrelease)
- load_prov = nwisgw20_nabuprov(start=load_prune)
- load_org = nwisgw20_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=nwisgw20_missingreport_graph(start=load_org)
- report_graph=nwisgw20_graph_reports(start=report_msgraph)
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw21.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw21.py
deleted file mode 100644
index 3067b9df..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw21.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- with open(image_path, 'rb') as f:
- data = f.read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if it does not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
- # revision of EC utilities, will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
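-# Hedged note: POSTing ?uri=<release_url> asks the (Blazegraph-style) namespace endpoint
-# to fetch and load the N-Quads release file itself; a 200 response containing
-# 'data modified="0"' means the fetch worked but nothing was inserted, hence the
-# "No Data Added" exception above.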
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
- ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
- # docker.py: for a replicated job, total completions = replicas
- # replicas=0 means you do not get a container
- service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
- mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise Exception(f"Container for service {name} not starting")
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
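- # Hedged mode -> args sketch (config paths are the defaults above; the image
- # entrypoint supplies the binary):
- #   gleaner: --cfg /gleaner/gleanerconfig.yaml -source <source> --rude
- #   prune:   --cfg /nabu/nabuconfig.yaml prune --prefix summoned/<source>
- #   prov:    --cfg /nabu/nabuconfig.yaml prefix --prefix prov/<source>
- #   orgs:    --cfg /nabu/nabuconfig.yaml prefix --prefix orgs
- #   release: --cfg /nabu/nabuconfig.yaml release --prefix summoned/<source>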
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here, e.g. "Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
- cid = container.id # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
- ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
- # Future: need to extract files, and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def nwisgw21_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def nwisgw21_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "nwisgw21")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw21_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "nwisgw21")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw21_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "nwisgw21")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw21_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "nwisgw21")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw21_naburelease(context):
- returned_value = gleanerio(context,("release"), "nwisgw21")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def nwisgw21_uploadrelease(context):
- returned_value = postRelease("nwisgw21")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def nwisgw21_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw21")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw21"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def nwisgw21_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw21")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw21"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def nwisgw21_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw21")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw21"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw21_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw21")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw21"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifier stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw21_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw21"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucket urls report returned {r} ")
- return
-
-
-#Can we simplify and use just a method. Then import these methods?
-# def missingreport_s3(context, msg: str, source="nwisgw21"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="nwisgw21"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_nwisgw21():
- containers = nwisgw21_getImage()
- harvest = nwisgw21_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = nwisgw21_missingreport_s3(start=harvest)
- report_idstat = nwisgw21_identifier_stats(start=report_ms3)
- # for some reason, this causes a "msg parameter missing" error
- report_bucketurl = nwisgw21_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="nwisgw21")
- load_release = nwisgw21_naburelease(start=harvest)
- load_uploadrelease = nwisgw21_uploadrelease(start=load_release)
-
- load_prune = nwisgw21_nabu_prune(start=load_uploadrelease)
- load_prov = nwisgw21_nabuprov(start=load_prune)
- load_org = nwisgw21_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=nwisgw21_missingreport_graph(start=load_org)
- report_graph=nwisgw21_graph_reports(start=report_msgraph)
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw22.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw22.py
deleted file mode 100644
index 2a868789..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw22.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- with open(image_path, 'rb') as f:
- data = f.read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
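-# Hedged behavior note: urlopen raises ValueError for a scheme-less string, so load_data
-# falls back to reading file_or_url as a local path; e.g. (hypothetical names)
-# load_data("https://example.org/cfg.yaml") fetches over HTTP while
-# load_data("cfg.yaml") reads the local file.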
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if it does not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
- # revision of EC utilities, will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
- ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
- # docker.py: for a replicated job, total completions = replicas
- # with replicas=0 you do not get a container
- serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
- mode=serivce_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise f"Container for service {name} not starting"
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
- cid = container.id # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
- ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
- # Future: need to extract files, and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def nwisgw22_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def nwisgw22_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "nwisgw22")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw22_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "nwisgw22")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw22_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "nwisgw22")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw22_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "nwisgw22")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw22_naburelease(context):
- returned_value = gleanerio(context,("release"), "nwisgw22")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def nwisgw22_uploadrelease(context):
- returned_value = postRelease("nwisgw22")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def nwisgw22_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw22")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw22"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def nwisgw22_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw22")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw22"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def nwisgw22_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw22")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw22"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw22_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw22")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw22"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw22_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw22"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
-#Can we simplify and use just a method. Then import these methods?
-# def missingreport_s3(context, msg: str, source="nwisgw22"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="nwisgw22"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_nwisgw22():
- containers = nwisgw22_getImage()
- harvest = nwisgw22_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = nwisgw22_missingreport_s3(start=harvest)
- report_idstat = nwisgw22_identifier_stats(start=report_ms3)
- # for some reason, this causes a "msg parameter missing" error
- report_bucketurl = nwisgw22_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="nwisgw22")
- load_release = nwisgw22_naburelease(start=harvest)
- load_uploadrelease = nwisgw22_uploadrelease(start=load_release)
-
- load_prune = nwisgw22_nabu_prune(start=load_uploadrelease)
- load_prov = nwisgw22_nabuprov(start=load_prune)
- load_org = nwisgw22_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=nwisgw22_missingreport_graph(start=load_org)
- report_graph=nwisgw22_graph_reports(start=report_msgraph)
-
-
-
-
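The module deleted above is regenerated nearly verbatim for every source (nwisgw22, nwisgw23, nwisgw24, ...); only the source name changes. The in-code question about simplifying these into one importable method points at a factory: build the per-source ops and graph from a single parameterized definition. A minimal sketch, assuming `gleanerio` gets lifted into a shared importable module (`make_harvest_graph` and the stub below are hypothetical, not part of this repo):

```python
# Hypothetical sketch: one parameterized factory instead of a generated module per source.
from dagster import In, Nothing, get_dagster_logger, graph, op


def gleanerio(context, mode, source):
    """Stand-in for the helper defined in the generated modules."""
    raise NotImplementedError


def make_harvest_graph(source: str):
    @op(name=f"{source}_gleaner")
    def gleaner(context):
        rc = gleanerio(context, "gleaner", source)
        get_dagster_logger().info(f"Gleaner returned {rc}")

    @op(name=f"{source}_naburelease", ins={"start": In(Nothing)})
    def release(context):
        gleanerio(context, "release", source)

    @graph(name=f"harvest_{source}")
    def harvest():
        # Nothing dependency: release runs after gleaner without passing data.
        release(start=gleaner())

    return harvest


# One job per source from a single definition:
jobs = [make_harvest_graph(s).to_job() for s in ("nwisgw22", "nwisgw23", "nwisgw24")]
```

The remaining per-source ops (prune, prov, orgs, reports) would follow the same pattern inside the factory.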
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw23.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw23.py
deleted file mode 100644
index 38b2304b..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw23.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, and not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
- # revision of EC utilities, will have a insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
- ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
- # docker.py: for a replicated job, total completions = replicas
- # with replicas=0 you do not get a container
- serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
- mode=serivce_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise f"Container for service {name} not starting"
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
- cid = container.id # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
- ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
- # Future: need to extract files, and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def nwisgw23_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def nwisgw23_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "nwisgw23")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw23_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "nwisgw23")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw23_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "nwisgw23")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw23_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "nwisgw23")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw23_naburelease(context):
- returned_value = gleanerio(context,("release"), "nwisgw23")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def nwisgw23_uploadrelease(context):
- returned_value = postRelease("nwisgw23")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def nwisgw23_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw23")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw23"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def nwisgw23_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw23")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw23"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def nwisgw23_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw23")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw23"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw23_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw23")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw23"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw23_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw23"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
-#Can we simplify and use just a method. Then import these methods?
-# def missingreport_s3(context, msg: str, source="nwisgw23"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="nwisgw23"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_nwisgw23():
- containers = nwisgw23_getImage()
- harvest = nwisgw23_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = nwisgw23_missingreport_s3(start=harvest)
- report_idstat = nwisgw23_identifier_stats(start=report_ms3)
- # for some reason, this causes a "msg parameter missing" error
- report_bucketurl = nwisgw23_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="nwisgw23")
- load_release = nwisgw23_naburelease(start=harvest)
- load_uploadrelease = nwisgw23_uploadrelease(start=load_release)
-
- load_prune = nwisgw23_nabu_prune(start=load_uploadrelease)
- load_prov = nwisgw23_nabuprov(start=load_prune)
- load_org = nwisgw23_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=nwisgw23_missingreport_graph(start=load_org)
- report_graph=nwisgw23_graph_reports(start=report_msgraph)
-
-
-
-
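The wait loop in `_create_service` polls once per second for the replicated-job's task container and gives up after a dozen tries. The same step written as a bounded poll with an explicit deadline and a real exception; a sketch against the docker SDK (the helper name and the 12-second budget are assumptions mirroring `wait_count > 12`):

```python
# Hypothetical sketch of the container-wait step in _create_service:
# poll until the swarm replicated-job's container appears, or fail loudly.
import time

import docker


def wait_for_service_container(client: docker.DockerClient, name: str,
                               timeout_s: float = 12.0):
    deadline = time.monotonic() + timeout_s
    # Swarm labels task containers with the owning service name.
    label = f"com.docker.swarm.service.name={name}"
    while time.monotonic() < deadline:
        containers = client.containers.list(all=True, filters={"label": label})
        if containers:
            return containers[0]
        time.sleep(1)
    raise RuntimeError(f"Container for service {name} not starting")
```

Raising a real exception type matters here: raising a bare f-string is itself a `TypeError` at runtime, which would mask the actual "service never started" failure.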
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw24.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw24.py
deleted file mode 100644
index f6728464..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw24.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, and not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
-    # a future revision of the EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
-        # Blazegraph answers an insert with an XML fragment such as
-        # <data modified="123" milliseconds="45"/>; modified="0" on an
-        # HTTP 200 means nothing was actually loaded, so treat it as a failure.
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
-    ## thoughts:
-    # return both service and container, since there is exactly one container
- restart_policy = RestartPolicy(condition='none')
-    # docker-py: for a replicated job, total completions = replicas;
-    # with replicas=0 you do not get a container
-    service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
-        mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
-        if wait_count > 12:
-            # a raised object must be an exception, not a plain string
-            raise Exception(f"Container for service {name} not starting")
-
-    get_dagster_logger().info(f"found {len(containers)} container(s) for service {name}")
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
-        cid = container.id  # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
-        ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
-        # Future: need to extract the files and upload them
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def nwisgw24_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def nwisgw24_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "nwisgw24")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw24_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "nwisgw24")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw24_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "nwisgw24")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw24_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "nwisgw24")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw24_naburelease(context):
- returned_value = gleanerio(context,("release"), "nwisgw24")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def nwisgw24_uploadrelease(context):
- returned_value = postRelease("nwisgw24")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def nwisgw24_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw24")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw24"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-    r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def nwisgw24_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw24")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw24"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def nwisgw24_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw24")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw24"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw24_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw24")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw24"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw24_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw24"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
-# Can we simplify and use just a method, then import these methods? See the sketch after this block.
-# def missingreport_s3(context, msg: str, source="nwisgw24"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="nwisgw24"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
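-
-# One possible answer to the question above: a small op factory that closes over
-# mode and source, so each generated per-source module could shrink to a few calls.
-# Editorial sketch only, reusing gleanerio(), op, In, Nothing and
-# get_dagster_logger() from this module; the "sketch_" names are placeholders,
-# not part of the generated output.
-def make_gleanerio_op(mode, source):
-    @op(name=f"sketch_{source}_{mode}", ins={"start": In(Nothing)})
-    def _gleanerio_op(context):
-        returned_value = gleanerio(context, mode, source)
-        get_dagster_logger().info(f"{mode} for {source} returned {returned_value} ")
-    return _gleanerio_op
-# e.g.: sketch_prune = make_gleanerio_op("prune", "nwisgw24")
-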
-@graph
-def harvest_nwisgw24():
- containers = nwisgw24_getImage()
- harvest = nwisgw24_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = nwisgw24_missingreport_s3(start=harvest)
- report_idstat = nwisgw24_identifier_stats(start=report_ms3)
-    # for some reason, this causes a "missing msg parameter" error
- report_bucketurl = nwisgw24_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="nwisgw24")
- load_release = nwisgw24_naburelease(start=harvest)
- load_uploadrelease = nwisgw24_uploadrelease(start=load_release)
-
- load_prune = nwisgw24_nabu_prune(start=load_uploadrelease)
- load_prov = nwisgw24_nabuprov(start=load_prune)
- load_org = nwisgw24_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=nwisgw24_missingreport_graph(start=load_org)
- report_graph=nwisgw24_graph_reports(start=report_msgraph)
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw25.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw25.py
deleted file mode 100644
index 3e7f9f02..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw25.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-    if url.endswith(".amazonaws.com"):
-        return "s3.amazonaws.com"
-    return url
-def read_file_bytestream(image_path):
-    # use a context manager so the file handle is closed after reading
-    with open(image_path, 'rb') as f:
-        return f.read()
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
-    secure = GLEANER_MINIO_USE_SSL
-    # omit the port on the standard http/https ports, append it otherwise
-    if GLEANER_MINIO_PORT == "80" and not secure:
-        server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
-    elif GLEANER_MINIO_PORT == "443" and secure:
-        server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
-    else:
-        # it's not on a standard port
-        server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
-    # a future revision of the EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
-    ## thoughts:
-    # return both service and container, since there is exactly one container
- restart_policy = RestartPolicy(condition='none')
-    # docker-py: for a replicated job, total completions = replicas;
-    # with replicas=0 you do not get a container
-    service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
-        mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
-        if wait_count > 12:
-            # a raised object must be an exception, not a plain string
-            raise Exception(f"Container for service {name} not starting")
-
-    get_dagster_logger().info(f"found {len(containers)} container(s) for service {name}")
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
-        cid = container.id  # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
-        ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
-        # Future: need to extract the files and upload them
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
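-# The if/elif mode chain in gleanerio() above could also be a lookup table.
-# Editorial sketch only: NABU_MODE_ARGS is a name invented here, and it covers
-# just the nabu modes ("gleaner" uses the gleaner config path instead).
-NABU_MODE_ARGS = {
-    "prune":   lambda source: ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source],
-    "prov":    lambda source: ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source],
-    "orgs":    lambda source: ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"],
-    "release": lambda source: ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source],
-}
-# e.g.: ARGS = NABU_MODE_ARGS["prune"]("nwisgw25")
-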
-@op
-def nwisgw25_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def nwisgw25_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "nwisgw25")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw25_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "nwisgw25")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw25_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "nwisgw25")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw25_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "nwisgw25")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw25_naburelease(context):
- returned_value = gleanerio(context,("release"), "nwisgw25")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def nwisgw25_uploadrelease(context):
- returned_value = postRelease("nwisgw25")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def nwisgw25_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw25")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw25"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-    r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def nwisgw25_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw25")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw25"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def nwisgw25_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw25")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw25"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw25_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw25")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw25"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw25_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw25"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
-# Can we simplify and use just a method, then import these methods?
-# def missingreport_s3(context, msg: str, source="nwisgw25"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="nwisgw25"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_nwisgw25():
- containers = nwisgw25_getImage()
- harvest = nwisgw25_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = nwisgw25_missingreport_s3(start=harvest)
- report_idstat = nwisgw25_identifier_stats(start=report_ms3)
-    # for some reason, this causes a "missing msg parameter" error
- report_bucketurl = nwisgw25_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="nwisgw25")
- load_release = nwisgw25_naburelease(start=harvest)
- load_uploadrelease = nwisgw25_uploadrelease(start=load_release)
-
- load_prune = nwisgw25_nabu_prune(start=load_uploadrelease)
- load_prov = nwisgw25_nabuprov(start=load_prune)
- load_org = nwisgw25_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=nwisgw25_missingreport_graph(start=load_org)
- report_graph=nwisgw25_graph_reports(start=report_msgraph)
-
-
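-# How a graph like harvest_nwisgw25 typically becomes runnable. Editorial
-# sketch: the project's actual job/schedule wiring lives elsewhere in the
-# generated output, and the cron string below is a placeholder.
-from dagster import ScheduleDefinition
-
-sketch_harvest_nwisgw25_job = harvest_nwisgw25.to_job(name="sketch_harvest_nwisgw25_job")
-sketch_harvest_nwisgw25_schedule = ScheduleDefinition(
-    job=sketch_harvest_nwisgw25_job, cron_schedule="0 6 * * *"
-)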
-
-
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw26.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw26.py
deleted file mode 100644
index 8cdab085..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw26.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-    if url.endswith(".amazonaws.com"):
-        return "s3.amazonaws.com"
-    return url
-def read_file_bytestream(image_path):
-    # use a context manager so the file handle is closed after reading
-    with open(image_path, 'rb') as f:
-        return f.read()
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
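-# load_data() accepts either a URL or a local path: urlopen() raises
-# ValueError ("unknown url type") for a plain filesystem path, and the
-# except clause falls back to open().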
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
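-# NOTE (editorial): minio's get_object() returns a urllib3 response object;
-# per the MinIO Python docs the caller should close() and release_conn() it
-# when finished with the stream.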
-
-def s3loader(data, name):
-    secure = GLEANER_MINIO_USE_SSL
-    # omit the port on the standard http/https ports, append it otherwise
-    if GLEANER_MINIO_PORT == "80" and not secure:
-        server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
-    elif GLEANER_MINIO_PORT == "443" and secure:
-        server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
-    else:
-        # it's not on a standard port
-        server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
- # revision of EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
- ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
- # docker-py: for a replicated job, total completions = replicas
- # replicas=0 means you do not get a container
- service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
- mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise f"Container for service {name} not starting"
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
- cid = container.id # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
- ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
- # Future: need to extract files and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def nwisgw26_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def nwisgw26_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "nwisgw26")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw26_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "nwisgw26")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw26_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "nwisgw26")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw26_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "nwisgw26")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw26_naburelease(context):
- returned_value = gleanerio(context,("release"), "nwisgw26")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def nwisgw26_uploadrelease(context):
- returned_value = postRelease("nwisgw26")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def nwisgw26_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw26")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw26"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def nwisgw26_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw26")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw26"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def nwisgw26_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw26")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw26"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw26_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw26")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw26"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifier stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw26_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw26"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucket urls report returned {r} ")
- return
-
-
-# Can we simplify and use just one method, then import these methods?
-# def missingreport_s3(context, msg: str, source="nwisgw26"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="nwisgw26"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_nwisgw26():
- containers = nwisgw26_getImage()
- harvest = nwisgw26_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = nwisgw26_missingreport_s3(start=harvest)
- report_idstat = nwisgw26_identifier_stats(start=report_ms3)
- # for some reason, this causes a "missing msg parameter" error
- report_bucketurl = nwisgw26_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="nwisgw26")
- load_release = nwisgw26_naburelease(start=harvest)
- load_uploadrelease = nwisgw26_uploadrelease(start=load_release)
-
- load_prune = nwisgw26_nabu_prune(start=load_uploadrelease)
- load_prov = nwisgw26_nabuprov(start=load_prune)
- load_org = nwisgw26_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=nwisgw26_missingreport_graph(start=load_org)
- report_graph=nwisgw26_graph_reports(start=report_msgraph)
-
-
-
-
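The deleted modules in this stretch of the diff (nwisgw26, nwisgw27, nwisgw28) differ only in the hard-coded source name, which is why the in-code question "Can we simplify and use just one method?" keeps recurring. A minimal sketch of that idea, assuming `gleanerio()` and `postRelease()` are importable from a shared helpers module; `build_harvest_graph` and the derived op names are hypothetical, not part of this diff:

```python
from dagster import In, Nothing, get_dagster_logger, graph, op

def build_harvest_graph(source: str):
    """Build the per-source op chain dynamically instead of generating a file per source."""

    @op(name=f"{source}_gleaner")
    def gleaner_op(context):
        # same call the generated modules make, with the source injected
        get_dagster_logger().info(f"Gleaner returned {gleanerio(context, 'gleaner', source)}")

    @op(name=f"{source}_naburelease", ins={"start": In(Nothing)})
    def release_op(context):
        get_dagster_logger().info(f"nabu release returned {gleanerio(context, 'release', source)}")

    @op(name=f"{source}_uploadrelease", ins={"start": In(Nothing)})
    def upload_op(context):
        get_dagster_logger().info(f"upload release returned {postRelease(source)}")

    @graph(name=f"harvest_{source}")
    def harvest():
        # Nothing-typed "start" inputs give ordering without passing data
        upload_op(start=release_op(start=gleaner_op()))

    return harvest

# usage: one graph per configured source, no generated files
# graphs = [build_harvest_graph(s) for s in ("nwisgw26", "nwisgw27", "nwisgw28")]
```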
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw27.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw27.py
deleted file mode 100644
index c4069a97..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw27.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, and not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- with open(image_path, 'rb') as f:
- return f.read()
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and not secure):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
- # revision of EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
- ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
- # docker-py: for a replicated job, total completions = replicas
- # replicas=0 means you do not get a container
- service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
- mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise f"Container for service {name} not starting"
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
- cid = container.id # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
- ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
- # Future: need to extract files and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def nwisgw27_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def nwisgw27_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "nwisgw27")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw27_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "nwisgw27")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw27_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "nwisgw27")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw27_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "nwisgw27")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw27_naburelease(context):
- returned_value = gleanerio(context,("release"), "nwisgw27")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def nwisgw27_uploadrelease(context):
- returned_value = postRelease("nwisgw27")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def nwisgw27_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw27")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw27"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def nwisgw27_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw27")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw27"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def nwisgw27_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw27")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw27"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw27_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw27")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw27"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifier stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw27_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw27"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucket urls report returned {r} ")
- return
-
-
-# Can we simplify and use just one method, then import these methods?
-# def missingreport_s3(context, msg: str, source="nwisgw27"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="nwisgw27"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_nwisgw27():
- containers = nwisgw27_getImage()
- harvest = nwisgw27_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = nwisgw27_missingreport_s3(start=harvest)
- report_idstat = nwisgw27_identifier_stats(start=report_ms3)
- # for some reason, this causes a "missing msg parameter" error
- report_bucketurl = nwisgw27_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="nwisgw27")
- load_release = nwisgw27_naburelease(start=harvest)
- load_uploadrelease = nwisgw27_uploadrelease(start=load_release)
-
- load_prune = nwisgw27_nabu_prune(start=load_uploadrelease)
- load_prov = nwisgw27_nabuprov(start=load_prune)
- load_org = nwisgw27_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=nwisgw27_missingreport_graph(start=load_org)
- report_graph=nwisgw27_graph_reports(start=report_msgraph)
-
-
-
-
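The `_create_service` helper in each of these modules encodes the same one-shot Docker Swarm pattern: a replicated-job service with replicas=1 runs the container exactly once (replicas=0 would schedule no container at all), the restart policy keeps swarm from re-running the finished task, and the caller polls for the task's container before streaming logs. A standalone sketch of that pattern, assuming an already-authenticated docker-py client; `run_once` is a hypothetical name:

```python
import time

import docker
from docker.types import RestartPolicy, ServiceMode

def run_once(client: docker.DockerClient, image: str, args, name: str, timeout: int = 12):
    """Run `image` exactly once as a swarm replicated-job and return (service, container)."""
    service = client.services.create(
        image,
        args=args,
        name=name,
        restart_policy=RestartPolicy(condition="none"),  # never restart the finished task
        mode=ServiceMode("replicated-job", concurrency=1, replicas=1),
    )
    # Swarm schedules the task asynchronously; poll until its container exists.
    for _ in range(timeout):
        time.sleep(1)
        containers = client.containers.list(
            all=True, filters={"label": f"com.docker.swarm.service.name={name}"}
        )
        if containers:
            return service, containers[0]
    service.remove()  # clean up a service whose task never materialized
    raise RuntimeError(f"Container for service {name} did not start within {timeout}s")
```

Once the container is in hand, `container.wait()` and `container.logs()` behave as in the plain-container case, which is why `gleanerio()` can treat the swarm task like a one-off docker run.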
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw28.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw28.py
deleted file mode 100644
index 3c5035c4..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw28.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, and not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- with open(image_path, 'rb') as f:
- return f.read()
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and not secure):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
-    # revision of EC utilities, will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
-    ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
- # docker.py if replicated job, total completions = replicas
- # replicas =0 you do not get a container
-    service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
-        mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise f"Container for service {name} not starting"
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
-        # add in env variables here, e.g. "Env": ["FOO=bar","BAZ=quux"]
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
-        cid = container.id # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
-        ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
-        # Future: need to extract files and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def nwisgw28_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def nwisgw28_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "nwisgw28")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw28_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "nwisgw28")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw28_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "nwisgw28")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw28_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "nwisgw28")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw28_naburelease(context):
- returned_value = gleanerio(context,("release"), "nwisgw28")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def nwisgw28_uploadrelease(context):
- returned_value = postRelease("nwisgw28")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def nwisgw28_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw28")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw28"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-    r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def nwisgw28_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw28")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw28"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def nwisgw28_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw28")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw28"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw28_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw28")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw28"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw28_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw28"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
-#Can we simplify and use just a method. Then import these methods?
-# def missingreport_s3(context, msg: str, source="nwisgw28"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="nwisgw28"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_nwisgw28():
- containers = nwisgw28_getImage()
- harvest = nwisgw28_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = nwisgw28_missingreport_s3(start=harvest)
- report_idstat = nwisgw28_identifier_stats(start=report_ms3)
- # for some reason, this causes a msg parameter missing
- report_bucketurl = nwisgw28_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="nwisgw28")
- load_release = nwisgw28_naburelease(start=harvest)
- load_uploadrelease = nwisgw28_uploadrelease(start=load_release)
-
- load_prune = nwisgw28_nabu_prune(start=load_uploadrelease)
- load_prov = nwisgw28_nabuprov(start=load_prune)
- load_org = nwisgw28_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=nwisgw28_missingreport_graph(start=load_org)
- report_graph=nwisgw28_graph_reports(start=report_msgraph)
-
-
-
-
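
Each generated `implnet_ops_<source>.py` removed here repeats these same ~700 lines verbatim, with only the source name (`nwisgw28`, `nwisgw3`, `nwisgw4`, ...) substituted; the in-code question "Can we simplify and use just a method. Then import these methods?" points at the cleanup. A minimal sketch, assuming a shared `gleanerio(context, mode, source)` helper like the one defined above, of a factory that builds the per-source ops and graph at import time instead of generating a file per source (the names and the reduced op set are illustrative, not the project's actual code):

```python
from dagster import In, Nothing, get_dagster_logger, graph, op


def make_harvest_graph(source: str):
    """Build the harvest ops and graph for one source at import time."""

    @op(name=f"{source}_getImage")
    def get_image(context):
        get_dagster_logger().info(f"pulling images for {source}")
        # client.images.pull(...) as in the generated getImage op (omitted here)

    @op(name=f"{source}_gleaner", ins={"start": In(Nothing)})
    def run_gleaner(context):
        get_dagster_logger().info(f"harvesting {source}")
        # gleanerio(context, "gleaner", source)  # shared helper would be called here

    @op(name=f"{source}_naburelease", ins={"start": In(Nothing)})
    def nabu_release(context):
        get_dagster_logger().info(f"releasing {source}")
        # gleanerio(context, "release", source)

    @graph(name=f"harvest_{source}")
    def harvest():
        # same Nothing-only ordering the generated graphs use
        nabu_release(start=run_gleaner(start=get_image()))

    return harvest


# one graph per source, built in a loop rather than code-generated
harvest_graphs = {s: make_harvest_graph(s) for s in ("nwisgw28", "nwisgw3", "nwisgw4")}
```

Dagster resolves op and graph names at definition time, so each factory call must pass a unique `name=`; the `In(Nothing)` inputs keep the ordering-only dependencies the generated graphs rely on.
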
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw3.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw3.py
deleted file mode 100644
index 2974d5e9..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw3.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
-    # revision of EC utilities, will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
-    ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
- # docker.py if replicated job, total completions = replicas
- # replicas =0 you do not get a container
-    service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
-        mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise f"Container for service {name} not starting"
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
-        # add in env variables here, e.g. "Env": ["FOO=bar","BAZ=quux"]
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
-        cid = container.id # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
-        ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
-        # Future: need to extract files and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def nwisgw3_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def nwisgw3_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "nwisgw3")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw3_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "nwisgw3")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw3_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "nwisgw3")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw3_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "nwisgw3")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw3_naburelease(context):
- returned_value = gleanerio(context,("release"), "nwisgw3")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def nwisgw3_uploadrelease(context):
- returned_value = postRelease("nwisgw3")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def nwisgw3_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw3")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw3"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-    r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def nwisgw3_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw3")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw3"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def nwisgw3_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw3")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw3"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw3_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw3")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw3"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw3_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw3"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
-#Can we simplify and use just a method. Then import these methods?
-# def missingreport_s3(context, msg: str, source="nwisgw3"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="nwisgw3"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_nwisgw3():
- containers = nwisgw3_getImage()
- harvest = nwisgw3_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = nwisgw3_missingreport_s3(start=harvest)
- report_idstat = nwisgw3_identifier_stats(start=report_ms3)
- # for some reason, this causes a msg parameter missing
- report_bucketurl = nwisgw3_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="nwisgw3")
- load_release = nwisgw3_naburelease(start=harvest)
- load_uploadrelease = nwisgw3_uploadrelease(start=load_release)
-
- load_prune = nwisgw3_nabu_prune(start=load_uploadrelease)
- load_prov = nwisgw3_nabuprov(start=load_prune)
- load_org = nwisgw3_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=nwisgw3_missingreport_graph(start=load_org)
- report_graph=nwisgw3_graph_reports(start=report_msgraph)
-
-
-
-
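
These modules all parse `GLEANERIO_MINIO_USE_SSL` with `bool(distutils.util.strtobool(...))`; `distutils` is deprecated (PEP 632) and removed in Python 3.12, so the `import distutils` at the top of each file will eventually fail outright. A small sketch of a distutils-free replacement with the same truth table as `strtobool` (the `env_flag` name is illustrative):

```python
import os


def env_flag(name: str, default: str = "false") -> bool:
    """Parse a boolean env var with the truth table distutils.util.strtobool used."""
    value = os.environ.get(name, default).strip().lower()
    if value in ("y", "yes", "t", "true", "on", "1"):
        return True
    if value in ("n", "no", "f", "false", "off", "0"):
        return False
    raise ValueError(f"invalid truth value for {name}: {value!r}")


# usage equivalent to the assignment in these modules:
# GLEANER_MINIO_USE_SSL = env_flag("GLEANERIO_MINIO_USE_SSL")
```

Unlike the original call, the `default` also avoids the `AttributeError` that `strtobool(None)` raises when the variable is unset.
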
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw4.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw4.py
deleted file mode 100644
index fd1fbca9..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw4.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
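-
-
-# A minimal sketch of the host-string rule s3loader applies above (an illustrative
-# helper, not part of the generated code): MinIO clients expect a bare host when the
-# scheme's default port is in use, and "host:port" otherwise.
-def _example_minio_server(address, port, secure):
-    default_port = "443" if secure else "80"
-    if port == default_port:
-        return address
-    return f"{address}:{port}"
-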
-def postRelease(source):
-    # revision of EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
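-
-# Blazegraph answers the INSERT-by-URL POST above with a small XML reply such as
-# <data modified="123" milliseconds="45"/>; postRelease keys off the modified="0" case.
-# A minimal sketch of pulling the count out of that reply (illustrative only, not part
-# of the generated code):
-def _example_modified_count(response_text):
-    import re
-    match = re.search(r'data modified="(\d+)"', response_text)
-    return int(match.group(1)) if match else 0
-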
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
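-
-# _get_client reaches the Docker Engine API through a Portainer endpoint proxy, which is
-# why every request must carry an X-API-Key header; docker-py exposes no public header
-# option, hence the _general_configs workaround above. A usage sketch (hypothetical
-# caller; PORTAINER_URL and PORTAINER_KEY must be set):
-#
-#   client = _get_client(DockerContainerContext())
-#   client.ping()  # raises if the proxy rejects the request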
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
-    ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
-    # docker-py: for a replicated job, total completions = replicas
-    # with replicas=0 you do not get a container
-    service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
-        mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
-            raise RuntimeError(f"Container for service {name} not starting")
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
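-
-# The service above is a one-shot job: ServiceMode("replicated-job", concurrency=1,
-# replicas=1) runs the task once to completion, and RestartPolicy(condition='none')
-# keeps swarm from rescheduling it on failure. A minimal standalone sketch of the same
-# pattern (illustrative image and service name, not part of the generated pipeline):
-def _example_one_shot_service(client):
-    return client.services.create(
-        "alpine:3",
-        args=["echo", "done"],
-        name="sch_example_oneshot",
-        restart_policy=RestartPolicy(condition="none"),
-        mode=ServiceMode("replicated-job", concurrency=1, replicas=1),
-    )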
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
-    service = None  # so the finally block can tell whether the service was created
-    try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
-        cid = container.id  # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
-        ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
-        # Future: need to extract files and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
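-
-# The mode ladder in gleanerio amounts to a small dispatch table; a sketch of the same
-# mapping for the nabu modes (illustrative only, with {source} filled in at call time):
-_EXAMPLE_NABU_MODE_ARGS = {
-    "prune":   ["prune", "--prefix", "summoned/{source}"],
-    "prov":    ["prefix", "--prefix", "prov/{source}"],
-    "orgs":    ["prefix", "--prefix", "orgs"],
-    "release": ["release", "--prefix", "summoned/{source}"],
-}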
-
-@op
-def nwisgw4_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def nwisgw4_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "nwisgw4")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw4_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "nwisgw4")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw4_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "nwisgw4")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw4_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "nwisgw4")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw4_naburelease(context):
- returned_value = gleanerio(context,("release"), "nwisgw4")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def nwisgw4_uploadrelease(context):
- returned_value = postRelease("nwisgw4")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def nwisgw4_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw4")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw4"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-    r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def nwisgw4_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw4")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw4"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def nwisgw4_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw4")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw4"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw4_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw4")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw4"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
-    get_dagster_logger().info(f"identifier stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw4_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw4"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
-    get_dagster_logger().info(f"bucket urls report returned {r} ")
- return
-
-
-# Can we simplify and just use a method, then import these methods?
-# def missingreport_s3(context, msg: str, source="nwisgw4"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="nwisgw4"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_nwisgw4():
- containers = nwisgw4_getImage()
- harvest = nwisgw4_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = nwisgw4_missingreport_s3(start=harvest)
- report_idstat = nwisgw4_identifier_stats(start=report_ms3)
-    # for some reason, this causes a "msg parameter missing" error
- report_bucketurl = nwisgw4_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="nwisgw4")
- load_release = nwisgw4_naburelease(start=harvest)
- load_uploadrelease = nwisgw4_uploadrelease(start=load_release)
-
- load_prune = nwisgw4_nabu_prune(start=load_uploadrelease)
- load_prov = nwisgw4_nabuprov(start=load_prune)
- load_org = nwisgw4_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=nwisgw4_missingreport_graph(start=load_org)
- report_graph=nwisgw4_graph_reports(start=report_msgraph)
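-
-# Elsewhere in the generated code this graph is wrapped into a runnable job; a minimal
-# sketch of doing the same by hand (the job name here is illustrative):
-#
-#   harvest_nwisgw4_job = harvest_nwisgw4.to_job(name="harvest_nwisgw4_job")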
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw5.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw5.py
deleted file mode 100644
index 0c59a791..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw5.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, and not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET = str(os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH = str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH = str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE = str(os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/'))  # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG = str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG = str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
-    # revision of EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
-    ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
-    # docker-py: for a replicated job, total completions = replicas
-    # with replicas=0 you do not get a container
-    service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
-        mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
-            raise RuntimeError(f"Container for service {name} not starting")
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
-    service = None  # so the finally block can tell whether the service was created
-    try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
-        cid = container.id  # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
-        ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
-        # Future: need to extract files and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def nwisgw5_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def nwisgw5_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "nwisgw5")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw5_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "nwisgw5")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw5_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "nwisgw5")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw5_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "nwisgw5")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw5_naburelease(context):
- returned_value = gleanerio(context,("release"), "nwisgw5")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def nwisgw5_uploadrelease(context):
- returned_value = postRelease("nwisgw5")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def nwisgw5_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw5")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw5"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-    r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def nwisgw5_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw5")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw5"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def nwisgw5_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw5")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw5"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw5_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw5")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw5"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
-    get_dagster_logger().info(f"identifier stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw5_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw5"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
-    get_dagster_logger().info(f"bucket urls report returned {r} ")
- return
-
-
-# Can we simplify and just use a method, then import these methods?
-# def missingreport_s3(context, msg: str, source="nwisgw5"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="nwisgw5"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_nwisgw5():
- containers = nwisgw5_getImage()
- harvest = nwisgw5_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = nwisgw5_missingreport_s3(start=harvest)
- report_idstat = nwisgw5_identifier_stats(start=report_ms3)
-    # for some reason, this causes a "msg parameter missing" error
- report_bucketurl = nwisgw5_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="nwisgw5")
- load_release = nwisgw5_naburelease(start=harvest)
- load_uploadrelease = nwisgw5_uploadrelease(start=load_release)
-
- load_prune = nwisgw5_nabu_prune(start=load_uploadrelease)
- load_prov = nwisgw5_nabuprov(start=load_prune)
- load_org = nwisgw5_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=nwisgw5_missingreport_graph(start=load_org)
- report_graph=nwisgw5_graph_reports(start=report_msgraph)
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw6.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw6.py
deleted file mode 100644
index 15ce0c7f..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw6.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, and not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET = str(os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH = str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH = str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE = str(os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/'))  # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG = str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG = str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
- # revision of EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
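-# A minimal sketch of the insert-by-URL request that postRelease() builds, assuming a
-# Blazegraph-style endpoint that loads a remote file named by a `uri` query parameter;
-# the endpoint and object URL below are placeholders, not live deployment values.
-def _example_insert_by_url():
-    endpoint = "http://graph:9999/blazegraph/namespace/iow/sparql"  # _graphEndpoint()
-    release_url = "http://minio:9000/gleaner/graphs/latest/nwisgw6_release.nq"
-    r = requests.post(f"{endpoint}?uri={release_url}")
-    # Blazegraph answers 200 even when nothing loads, so check the mutation count too.
-    return r.status_code == 200 and 'data modified="0"' not in r.text
-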
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
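-# Minimal sketch of the header injection above: docker-py has no public
-# per-request header hook, so the Portainer X-API-Key is set on the underlying
-# requests session; this leans on docker-py internals and is an assumption,
-# not a documented API.
-def _example_headered_client(base_url, apikey):
-    c = docker.DockerClient(base_url=base_url, version="1.43")
-    c.api.headers['X-API-Key'] = apikey  # APIClient subclasses requests.Session
-    return c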
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
- ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
- # docker.py: for a replicated job, total completions = replicas
- # with replicas=0 you do not get a container
- service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env=env_vars,
- name=name,
- networks=container_context.networks if len(container_context.networks) else None,
- restart_policy=restart_policy,
- mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise Exception(f"Container for service {name} not starting")
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
-
-
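-# Sketch of the polling rationale in _create_service(): a "replicated-job" with
-# replicas=1 runs the task once to completion, but its backing container appears
-# asynchronously, so the swarm service label is polled. The retry count and delay
-# mirror the loop above; they are tuning guesses, not measured values.
-def _example_wait_for_job_container(client, name, tries=12, delay=1):
-    for _ in range(tries):
-        time.sleep(delay)
-        found = client.containers.list(
-            all=True, filters={"label": f"com.docker.swarm.service.name={name}"})
-        if found:
-            return found[0]
-    raise Exception(f"Container for service {name} not starting")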
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add env variables here, e.g. "Env": ["FOO=bar", "BAZ=quux"]
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
- cid = container.id # legacy till the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
- ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
- # Future: need to extract files, and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def nwisgw6_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def nwisgw6_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "nwisgw6")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw6_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "nwisgw6")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw6_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "nwisgw6")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw6_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "nwisgw6")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw6_naburelease(context):
- returned_value = gleanerio(context,("release"), "nwisgw6")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def nwisgw6_uploadrelease(context):
- returned_value = postRelease("nwisgw6")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def nwisgw6_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw6")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw6"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def nwisgw6_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw6")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw6"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def nwisgw6_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw6")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw6"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw6_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw6")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw6"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifier stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw6_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw6"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucket urls report returned {r} ")
- return
-
-
-# Can we simplify this to plain methods, then import those methods?
-# def missingreport_s3(context, msg: str, source="nwisgw6"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="nwisgw6"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_nwisgw6():
- containers = nwisgw6_getImage()
- harvest = nwisgw6_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = nwisgw6_missingreport_s3(start=harvest)
- report_idstat = nwisgw6_identifier_stats(start=report_ms3)
- # for some reason, this caused a missing "msg" parameter error
- report_bucketurl = nwisgw6_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="nwisgw6")
- load_release = nwisgw6_naburelease(start=harvest)
- load_uploadrelease = nwisgw6_uploadrelease(start=load_release)
-
- load_prune = nwisgw6_nabu_prune(start=load_uploadrelease)
- load_prov = nwisgw6_nabuprov(start=load_prune)
- load_org = nwisgw6_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=nwisgw6_missingreport_graph(start=load_org)
- report_graph=nwisgw6_graph_reports(start=report_msgraph)
-
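-# Minimal sketch of the Nothing-dependency wiring used above: In(Nothing) orders
-# ops without passing data (the op and graph names below are illustrative only).
-@op
-def _example_first():
-    pass
-
-@op(ins={"start": In(Nothing)})
-def _example_second():
-    pass
-
-@graph
-def _example_ordering():
-    _example_second(start=_example_first())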
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw7.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw7.py
deleted file mode 100644
index 50ca251f..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw7.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils.util
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET = str(os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH = str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH = str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE = str(os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/'))  # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG = str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG = str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
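-
-# Expected behavior of _pythonMinioUrl(), shown as assertions: any *.amazonaws.com
-# host collapses to the global S3 endpoint, everything else passes through
-# (the hostnames here are illustrative).
-def _example_minio_url_normalization():
-    assert _pythonMinioUrl("mybucket.s3.amazonaws.com") == "s3.amazonaws.com"
-    assert _pythonMinioUrl("minio.example.org") == "minio.example.org"
-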
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
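-# Usage sketch for load_data(): urlopen() raises ValueError for a non-URL
-# string, which triggers the local-file fallback (the paths are illustrative).
-# load_data("https://example.org/sitemap.xml")   # fetched over HTTP
-# load_data("/gleaner/gleanerconfig.yaml")       # read from local disk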
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
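-
-# Sketch of the endpoint rule s3loader() applies, pulled out as a standalone
-# helper for illustration: the port is dropped only when it is the scheme
-# default (80 for http, 443 for https); otherwise host:port is used.
-def _example_minio_endpoint(address, port, secure):
-    if port == "80" and not secure:
-        return _pythonMinioUrl(address)
-    if port == "443" and secure:
-        return _pythonMinioUrl(address)
-    return f"{_pythonMinioUrl(address)}:{port}"
-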
-def postRelease(source):
- # revision of EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
- ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
- # docker.py: for a replicated job, total completions = replicas
- # with replicas=0 you do not get a container
- service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env=env_vars,
- name=name,
- networks=container_context.networks if len(container_context.networks) else None,
- restart_policy=restart_policy,
- mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise Exception(f"Container for service {name} not starting")
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add env variables here, e.g. "Env": ["FOO=bar", "BAZ=quux"]
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
- cid = container.id # legacy till the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
- ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
- # Future: need to extract files, and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def nwisgw7_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def nwisgw7_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "nwisgw7")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw7_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "nwisgw7")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw7_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "nwisgw7")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw7_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "nwisgw7")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw7_naburelease(context):
- returned_value = gleanerio(context,("release"), "nwisgw7")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def nwisgw7_uploadrelease(context):
- returned_value = postRelease("nwisgw7")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def nwisgw7_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw7")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw7"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def nwisgw7_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw7")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw7"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def nwisgw7_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw7")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw7"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw7_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw7")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw7"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifier stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw7_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw7"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucket urls report returned {r} ")
- return
-
-
-# Can we simplify this to plain methods, then import those methods?
-# def missingreport_s3(context, msg: str, source="nwisgw7"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="nwisgw7"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_nwisgw7():
- containers = nwisgw7_getImage()
- harvest = nwisgw7_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = nwisgw7_missingreport_s3(start=harvest)
- report_idstat = nwisgw7_identifier_stats(start=report_ms3)
- # for some reason, this caused a missing "msg" parameter error
- report_bucketurl = nwisgw7_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="nwisgw7")
- load_release = nwisgw7_naburelease(start=harvest)
- load_uploadrelease = nwisgw7_uploadrelease(start=load_release)
-
- load_prune = nwisgw7_nabu_prune(start=load_uploadrelease)
- load_prov = nwisgw7_nabuprov(start=load_prune)
- load_org = nwisgw7_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=nwisgw7_missingreport_graph(start=load_org)
- report_graph=nwisgw7_graph_reports(start=report_msgraph)
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw8.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw8.py
deleted file mode 100644
index 1bed0045..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw8.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils.util
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET = str(os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH = str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH = str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE = str(os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/'))  # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG = str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG = str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
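-# Upload a bytes payload to MinIO as a timestamped .log object under GLEANERIO_LOG_PREFIX.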
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
-    # a future revision of EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
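-# Build a docker client for the Portainer-proxied Docker API, injecting the X-API-Key header into every request.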
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
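-# Launch the image as a one-shot swarm service (replicated-job) with the gleaner/nabu docker
-# configs attached, then poll until its container appears (raising after ~12s); returns (service, container).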
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
-    ## thoughts
-    # return service and container, since there is only one
- restart_policy = RestartPolicy(condition='none')
-    # docker.py: for a replicated job, total completions = replicas
-    # with replicas=0 you do not get a container
-    service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
-        mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise f"Container for service {name} not starting"
-
-    get_dagster_logger().info(f"containers: {len(containers)}")
- return service, containers[0]
-
-
-
-
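-# Dispatch on mode (gleaner | prune | prov | orgs | release) to pick the image and CLI args,
-# run it as a swarm service, stream and archive its logs to S3, and raise on a non-zero exit.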
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
-        cid = container.id # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
-        ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
-        # Future: need to extract files and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def nwisgw8_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def nwisgw8_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "nwisgw8")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw8_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "nwisgw8")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw8_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "nwisgw8")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw8_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "nwisgw8")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw8_naburelease(context):
- returned_value = gleanerio(context,("release"), "nwisgw8")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def nwisgw8_uploadrelease(context):
- returned_value = postRelease("nwisgw8")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def nwisgw8_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw8")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw8"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-    r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def nwisgw8_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw8")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw8"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def nwisgw8_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw8")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw8"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw8_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw8")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw8"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw8_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw8"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
-#Can we simplify and use just a method. Then import these methods?
-# def missingreport_s3(context, msg: str, source="nwisgw8"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="nwisgw8"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_nwisgw8():
- containers = nwisgw8_getImage()
- harvest = nwisgw8_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = nwisgw8_missingreport_s3(start=harvest)
- report_idstat = nwisgw8_identifier_stats(start=report_ms3)
-    # for some reason, this causes a "msg parameter missing" error
- report_bucketurl = nwisgw8_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="nwisgw8")
- load_release = nwisgw8_naburelease(start=harvest)
- load_uploadrelease = nwisgw8_uploadrelease(start=load_release)
-
- load_prune = nwisgw8_nabu_prune(start=load_uploadrelease)
- load_prov = nwisgw8_nabuprov(start=load_prune)
- load_org = nwisgw8_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=nwisgw8_missingreport_graph(start=load_org)
- report_graph=nwisgw8_graph_reports(start=report_msgraph)
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw9.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw9.py
deleted file mode 100644
index c7c07d5c..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw9.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils.util
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in Docker, not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using the GLEANER prefix even though this is a nabu property; one prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
-    # read the whole file; a context manager ensures the handle is closed
-    with open(image_path, 'rb') as f:
-        return f.read()
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
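-# Fetch an object from the MinIO bucket; returns the response stream, or None when an S3Error is caught.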
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
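-# Upload a bytes payload to MinIO as a timestamped .log object under GLEANERIO_LOG_PREFIX.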
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
-    # a future revision of EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
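-# Build a docker client for the Portainer-proxied Docker API, injecting the X-API-Key header into every request.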
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
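-# Launch the image as a one-shot swarm service (replicated-job) with the gleaner/nabu docker
-# configs attached, then poll until its container appears (raising after ~12s); returns (service, container).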
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
-    ## thoughts
-    # return service and container, since there is only one
- restart_policy = RestartPolicy(condition='none')
-    # docker.py: for a replicated job, total completions = replicas
-    # with replicas=0 you do not get a container
-    service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
-        mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise f"Container for service {name} not starting"
-
-    get_dagster_logger().info(f"containers: {len(containers)}")
- return service, containers[0]
-
-
-
-
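-# Dispatch on mode (gleaner | prune | prov | orgs | release) to pick the image and CLI args,
-# run it as a swarm service, stream and archive its logs to S3, and raise on a non-zero exit.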
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
-        cid = container.id # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
-        ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
-        # Future: need to extract files and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def nwisgw9_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def nwisgw9_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "nwisgw9")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw9_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "nwisgw9")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw9_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "nwisgw9")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw9_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "nwisgw9")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw9_naburelease(context):
- returned_value = gleanerio(context,("release"), "nwisgw9")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def nwisgw9_uploadrelease(context):
- returned_value = postRelease("nwisgw9")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def nwisgw9_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw9")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw9"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-    r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def nwisgw9_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw9")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw9"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def nwisgw9_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw9")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw9"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw9_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw9")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw9"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwisgw9_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwisgw9"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
-#Can we simplify and use just a method. Then import these methods?
-# def missingreport_s3(context, msg: str, source="nwisgw9"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="nwisgw9"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_nwisgw9():
- containers = nwisgw9_getImage()
- harvest = nwisgw9_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = nwisgw9_missingreport_s3(start=harvest)
- report_idstat = nwisgw9_identifier_stats(start=report_ms3)
-    # for some reason, this causes a "msg parameter missing" error
- report_bucketurl = nwisgw9_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="nwisgw9")
- load_release = nwisgw9_naburelease(start=harvest)
- load_uploadrelease = nwisgw9_uploadrelease(start=load_release)
-
- load_prune = nwisgw9_nabu_prune(start=load_uploadrelease)
- load_prov = nwisgw9_nabuprov(start=load_prune)
- load_org = nwisgw9_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=nwisgw9_missingreport_graph(start=load_org)
- report_graph=nwisgw9_graph_reports(start=report_msgraph)
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwissite0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwissite0.py
deleted file mode 100644
index 84bea693..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwissite0.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils.util
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in Docker, not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using the GLEANER prefix even though this is a nabu property; one prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
-    # read the whole file; a context manager ensures the handle is closed
-    with open(image_path, 'rb') as f:
-        return f.read()
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
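-# Fetch an object from the MinIO bucket; returns the response stream, or None when an S3Error is caught.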
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
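-    # MinIO wants host[:port]; drop the port when it is the protocol default (80 for HTTP, 443 for HTTPS)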
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
-    # a coming revision of EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
-    ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
- # docker.py if replicated job, total completions = replicas
- # replicas =0 you do not get a container
-    service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
-    service = client.services.create(
-        image,
-        args=command,
-        env=env_vars,
-        name=name,
-        networks=container_context.networks if len(container_context.networks) else None,
-        restart_policy=restart_policy,
-        mode=service_mode,
-        workdir=workingdir,
-        configs=configs
-    )
- wait_count =0
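-    # swarm schedules the one-shot task asynchronously; poll until its container appears (give up after ~12s)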
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise f"Container for service {name} not starting"
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
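-        # of this body only Cmd and WorkingDir reach _create_service below; Env travels
-        # via the DockerContainerContext, and HostConfig is leftover from the raw
-        # container-create API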
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
-    cid = container.id # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
-    ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
-        # Future: need to extract files and upload them
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def nwissite0_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def nwissite0_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "nwissite0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwissite0_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "nwissite0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwissite0_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "nwissite0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwissite0_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "nwissite0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwissite0_naburelease(context):
- returned_value = gleanerio(context,("release"), "nwissite0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def nwissite0_uploadrelease(context):
- returned_value = postRelease("nwissite0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def nwissite0_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwissite0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwissite0"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-    r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def nwissite0_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwissite0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwissite0"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def nwissite0_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwissite0")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwissite0"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwissite0_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwissite0")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwissite0"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwissite0_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwissite0"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
-#Can we simplify and use just a method. Then import these methods?
-# def missingreport_s3(context, msg: str, source="nwissite0"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="nwissite0"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_nwissite0():
- containers = nwissite0_getImage()
- harvest = nwissite0_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = nwissite0_missingreport_s3(start=harvest)
- report_idstat = nwissite0_identifier_stats(start=report_ms3)
-    # for some reason, this causes a "msg parameter missing" error
- report_bucketurl = nwissite0_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="nwissite0")
- load_release = nwissite0_naburelease(start=harvest)
- load_uploadrelease = nwissite0_uploadrelease(start=load_release)
-
- load_prune = nwissite0_nabu_prune(start=load_uploadrelease)
- load_prov = nwissite0_nabuprov(start=load_prune)
- load_org = nwissite0_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=nwissite0_missingreport_graph(start=load_org)
- report_graph=nwissite0_graph_reports(start=report_msgraph)
-
-
-
-
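Aside: the generated graphs above order ops without passing data. Each downstream op declares `ins={"start": In(Nothing)}` and is wired with `start=<upstream op call>`. A minimal sketch of that pattern, with illustrative names (`fetch`/`report`) that are not from the generated code:

```python
from dagster import In, Nothing, get_dagster_logger, graph, op

@op
def fetch():
    # stands in for the harvest step
    get_dagster_logger().info("harvest")

@op(ins={"start": In(Nothing)})
def report():
    # ordered strictly after fetch; no value is passed
    get_dagster_logger().info("report")

@graph
def harvest_example():
    report(start=fetch())
```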
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwissite1.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwissite1.py
deleted file mode 100644
index 2fccc6a4..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwissite1.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils.util  # strtobool is in the util submodule; a bare "import distutils" does not reliably expose it
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network names must be the Docker names, not the object names from docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# use the GLEANER prefix even though this is a nabu property; a single prefix is simpler
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
-    with open(image_path, 'rb') as f:
-        data = f.read()
-    return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
-    # a coming revision of EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
-    ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
- # docker.py if replicated job, total completions = replicas
- # replicas =0 you do not get a container
-    service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
-    service = client.services.create(
-        image,
-        args=command,
-        env=env_vars,
-        name=name,
-        networks=container_context.networks if len(container_context.networks) else None,
-        restart_policy=restart_policy,
-        mode=service_mode,
-        workdir=workingdir,
-        configs=configs
-    )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise f"Container for service {name} not starting"
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
-    cid = container.id # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
-    ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
-        # Future: need to extract files and upload them
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def nwissite1_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def nwissite1_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "nwissite1")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwissite1_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "nwissite1")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwissite1_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "nwissite1")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwissite1_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "nwissite1")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwissite1_naburelease(context):
- returned_value = gleanerio(context,("release"), "nwissite1")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def nwissite1_uploadrelease(context):
- returned_value = postRelease("nwissite1")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def nwissite1_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwissite1")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwissite1"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-    r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def nwissite1_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwissite1")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwissite1"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def nwissite1_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwissite1")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwissite1"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwissite1_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwissite1")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwissite1"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwissite1_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwissite1"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
-#Can we simplify and use just a method. Then import these methods?
-# def missingreport_s3(context, msg: str, source="nwissite1"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="nwissite1"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_nwissite1():
- containers = nwissite1_getImage()
- harvest = nwissite1_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = nwissite1_missingreport_s3(start=harvest)
- report_idstat = nwissite1_identifier_stats(start=report_ms3)
-    # for some reason, this causes a "msg parameter missing" error
- report_bucketurl = nwissite1_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="nwissite1")
- load_release = nwissite1_naburelease(start=harvest)
- load_uploadrelease = nwissite1_uploadrelease(start=load_release)
-
- load_prune = nwissite1_nabu_prune(start=load_uploadrelease)
- load_prov = nwissite1_nabuprov(start=load_prune)
- load_org = nwissite1_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=nwissite1_missingreport_graph(start=load_org)
- report_graph=nwissite1_graph_reports(start=report_msgraph)
-
-
-
-
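Aside: `_get_client` above talks to the Docker Engine API through Portainer, which authenticates with an `X-API-Key` header. docker-py has no per-request header argument, so the key is pushed into the low-level client. A stripped-down sketch of the same trick (the URL and key values are placeholders):

```python
import docker

def portainer_client(url: str, api_key: str) -> docker.DockerClient:
    client = docker.DockerClient(base_url=url, version="1.43")
    # set the header on the underlying APIClient so every request carries it
    client.api.headers["X-API-Key"] = api_key
    return client

# usage (hypothetical values):
# client = portainer_client("https://portainer.example.org:9443/api/endpoints/1/docker", "ptr_abc123")
```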
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwissite2.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwissite2.py
deleted file mode 100644
index 29c8c5b4..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwissite2.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils.util  # strtobool is in the util submodule; a bare "import distutils" does not reliably expose it
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network names must be the Docker names, not the object names from docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# use the GLEANER prefix even though this is a nabu property; a single prefix is simpler
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
-    # use a context manager so the file handle is closed promptly
-    with open(image_path, 'rb') as f:
-        data = f.read()
-    return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
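-# load_data: urlopen raises ValueError ("unknown url type") for plain file
-# paths, so the except branch falls back to reading a local file. A bare
-# filename therefore works as well as an http(s) URL.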
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
-    secure = GLEANER_MINIO_USE_SSL
-    # omit the port when it is the protocol default (80 for http, 443 for https);
-    # otherwise the MinIO client needs an explicit host:port endpoint
-    if GLEANER_MINIO_PORT == "80" and not secure:
-        server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
-    elif GLEANER_MINIO_PORT == "443" and secure:
-        server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
-    else:
-        # it's not on a standard port
-        server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
-    # revision of EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
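-# postRelease has the graph store pull the file itself: Blazegraph's REST API
-# accepts POST .../sparql?uri=<url> and loads the RDF it finds at that URL, so
-# the nquads never pass through this process. The 'data modified="0"' check
-# catches a 200 response that inserted nothing.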
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
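-# _get_client reaches the Docker Engine API through Portainer, hence the
-# X-API-Key header pushed into docker-py's request headers. Writing to
-# client.api._general_configs mirrors the HttpHeaders entry docker-py reads
-# from ~/.docker/config.json, but it is an internal, so it may break across
-# docker-py versions.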
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
-    ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
- # docker.py if replicated job, total completions = replicas
- # replicas =0 you do not get a container
-    service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
-        mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise f"Container for service {name} not starting"
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
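-# _create_service runs the workload as a one-shot swarm service: mode
-# "replicated-job" with replicas=1 and restart condition "none" yields a single
-# task that runs to completion. The polling loop then waits (up to ~12s) for
-# the task's container to appear so logs can be read from it later.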
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
-        cid = container.id  # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
-        ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
-        # Future: need to extract the files and upload them
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
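-# gleanerio lifecycle, in brief: choose image/args for the mode, create a
-# one-shot service, stream the container logs, wait for the exit code, archive
-# logs (stdout plus the in-container log directory) to s3, and finally remove
-# the service unless DEBUG is set. Logs are uploaded before any exception is
-# raised, so a failed run still leaves evidence behind.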
-
-@op
-def nwissite2_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def nwissite2_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "nwissite2")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwissite2_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "nwissite2")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwissite2_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "nwissite2")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwissite2_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "nwissite2")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwissite2_naburelease(context):
- returned_value = gleanerio(context,("release"), "nwissite2")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def nwissite2_uploadrelease(context):
- returned_value = postRelease("nwissite2")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def nwissite2_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwissite2")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwissite2"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-    r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def nwissite2_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwissite2")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwissite2"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def nwissite2_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwissite2")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwissite2"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwissite2_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwissite2")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwissite2"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwissite2_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwissite2"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
-#Can we simplify and use just a method. Then import these methods?
-# def missingreport_s3(context, msg: str, source="nwissite2"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="nwissite2"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_nwissite2():
- containers = nwissite2_getImage()
- harvest = nwissite2_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = nwissite2_missingreport_s3(start=harvest)
- report_idstat = nwissite2_identifier_stats(start=report_ms3)
- # for some reason, this causes a msg parameter missing
- report_bucketurl = nwissite2_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="nwissite2")
- load_release = nwissite2_naburelease(start=harvest)
- load_uploadrelease = nwissite2_uploadrelease(start=load_release)
-
- load_prune = nwissite2_nabu_prune(start=load_uploadrelease)
- load_prov = nwissite2_nabuprov(start=load_prune)
- load_org = nwissite2_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=nwissite2_missingreport_graph(start=load_org)
- report_graph=nwissite2_graph_reports(start=report_msgraph)
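-# A sketch of how this graph is typically materialized (names below are
-# illustrative, not defined in this file):
-#
-#   harvest_nwissite2_job = harvest_nwissite2.to_job()
-#   harvest_nwissite2_schedule = ScheduleDefinition(
-#       job=harvest_nwissite2_job, cron_schedule="0 6 * * *")
-#
-# to_job() and ScheduleDefinition are standard dagster APIs for launching a
-# @graph on a schedule.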
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwissite3.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwissite3.py
deleted file mode 100644
index bca31ef4..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwissite3.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils.util  # strtobool lives in the util submodule; "import distutils" alone does not load it
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, and not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
-    # use a context manager so the file handle is closed promptly
-    with open(image_path, 'rb') as f:
-        data = f.read()
-    return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
-    secure = GLEANER_MINIO_USE_SSL
-    # omit the port when it is the protocol default (80 for http, 443 for https);
-    # otherwise the MinIO client needs an explicit host:port endpoint
-    if GLEANER_MINIO_PORT == "80" and not secure:
-        server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
-    elif GLEANER_MINIO_PORT == "443" and secure:
-        server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
-    else:
-        # it's not on a standard port
-        server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
-    # revision of EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
-    ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
- # docker.py if replicated job, total completions = replicas
- # replicas =0 you do not get a container
-    service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
-        mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise f"Container for service {name} not starting"
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
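-        # note: an unknown mode returns 1 here without raising, so the calling
-        # op only logs the failure; the generated ops always pass one of the
-        # five known modes.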
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
-        cid = container.id  # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
-        ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
-        # Future: need to extract the files and upload them
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def nwissite3_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def nwissite3_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "nwissite3")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwissite3_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "nwissite3")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwissite3_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "nwissite3")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwissite3_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "nwissite3")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwissite3_naburelease(context):
- returned_value = gleanerio(context,("release"), "nwissite3")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def nwissite3_uploadrelease(context):
- returned_value = postRelease("nwissite3")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def nwissite3_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwissite3")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwissite3"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-    r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def nwissite3_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwissite3")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwissite3"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def nwissite3_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwissite3")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwissite3"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwissite3_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwissite3")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwissite3"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def nwissite3_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "nwissite3"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
-#Can we simplify and use just a method. Then import these methods?
-# def missingreport_s3(context, msg: str, source="nwissite3"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="nwissite3"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_nwissite3():
- containers = nwissite3_getImage()
- harvest = nwissite3_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = nwissite3_missingreport_s3(start=harvest)
- report_idstat = nwissite3_identifier_stats(start=report_ms3)
- # for some reason, this causes a msg parameter missing
- report_bucketurl = nwissite3_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="nwissite3")
- load_release = nwissite3_naburelease(start=harvest)
- load_uploadrelease = nwissite3_uploadrelease(start=load_release)
-
- load_prune = nwissite3_nabu_prune(start=load_uploadrelease)
- load_prov = nwissite3_nabuprov(start=load_prune)
- load_org = nwissite3_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=nwissite3_missingreport_graph(start=load_org)
- report_graph=nwissite3_graph_reports(start=report_msgraph)
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_places0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_places0.py
deleted file mode 100644
index ffc44452..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_places0.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils.util  # strtobool lives in the util submodule; "import distutils" alone does not load it
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, and not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET = str(os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH = str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH = str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE = str(os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/'))  # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG = str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG = str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
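-# Sketch: distutils is deprecated (PEP 632) and removed in Python 3.12, so the
-# strtobool call above will eventually break. A minimal, hypothetical drop-in
-# replacement (not part of the generated code) could be:
-#
-#   def _env_bool(var, default="false"):
-#       val = os.environ.get(var, default).strip().lower()
-#       if val in ("y", "yes", "t", "true", "on", "1"):
-#           return True
-#       if val in ("n", "no", "f", "false", "off", "0"):
-#           return False
-#       raise ValueError(f"invalid truth value {val!r} for {var}")
-#
-#   GLEANER_MINIO_USE_SSL = _env_bool('GLEANERIO_MINIO_USE_SSL')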
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-    if url.endswith(".amazonaws.com"):
-        return "s3.amazonaws.com"
-    return url
-def read_file_bytestream(image_path):
-    with open(image_path, 'rb') as f:
-        return f.read()
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
-    secure = GLEANER_MINIO_USE_SSL
-    if GLEANER_MINIO_PORT == "80" and not secure:
-        server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
-    elif GLEANER_MINIO_PORT == "443" and secure:
-        server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
-    else:
-        # it's not on a default port, so the port must be explicit
-        server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
-    # a future revision of the EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
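-# The POST above uses Blazegraph's load-by-URL form of the SPARQL endpoint; the
-# assembled request looks roughly like this (host and namespace are hypothetical):
-#   POST http://graph:9999/blazegraph/namespace/earthcube/sparql?uri=http://minio:9000/gleaner/graphs/latest/places0_release.nq
-# Blazegraph fetches the referenced nquads file itself and answers with a
-# <data modified="N" .../> element, which is why the code checks for 'data modified="0"'.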
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
-    ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
- # docker.py if replicated job, total completions = replicas
- # replicas =0 you do not get a container
-    service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
-        mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
-    wait_count = 0
-    while True:
-        time.sleep(1)
-        wait_count += 1
-        get_dagster_logger().debug(str(service.tasks()))
-
-        container_task = service.tasks(filters={"service": name})
-
-        containers = client.containers.list(all=True, filters={"label": f"com.docker.swarm.service.name={name}"})
-        if len(containers) > 0:
-            break
-        if wait_count > 12:
-            raise Exception(f"Container for service {name} not starting")
-
-    get_dagster_logger().info(str(len(containers)))
-    return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
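-    # Summary of the dispatch above (the image entrypoint supplies the binary,
-    # ARGS supplies the rest of the command line):
-    #   gleaner -> gleaner --cfg <gleaner cfg> -source <source> --rude
-    #   prune   -> nabu --cfg <nabu cfg> prune   --prefix summoned/<source>
-    #   prov    -> nabu --cfg <nabu cfg> prefix  --prefix prov/<source>
-    #   orgs    -> nabu --cfg <nabu cfg> prefix  --prefix orgs
-    #   release -> nabu --cfg <nabu cfg> release --prefix summoned/<source>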
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
-    service = None  # so the finally block can tell whether the service was ever created
-    try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
-        cid = container.id # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
-        ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
-        # Future: need to extract the files, and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def places0_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def places0_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "places0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def places0_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "places0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def places0_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "places0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def places0_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "places0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def places0_naburelease(context):
- returned_value = gleanerio(context,("release"), "places0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def places0_uploadrelease(context):
- returned_value = postRelease("places0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def places0_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="places0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "places0"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-    r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def places0_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="places0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "places0"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def places0_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="places0")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "places0"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def places0_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="places0")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "places0"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def places0_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "places0"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
-#Can we simplify and use just a method. Then import these methods?
-# def missingreport_s3(context, msg: str, source="places0"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="places0"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_places0():
- containers = places0_getImage()
- harvest = places0_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = places0_missingreport_s3(start=harvest)
- report_idstat = places0_identifier_stats(start=report_ms3)
- # for some reason, this causes a msg parameter missing
- report_bucketurl = places0_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="places0")
- load_release = places0_naburelease(start=harvest)
- load_uploadrelease = places0_uploadrelease(start=load_release)
-
- load_prune = places0_nabu_prune(start=load_uploadrelease)
- load_prov = places0_nabuprov(start=load_prune)
- load_org = places0_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=places0_missingreport_graph(start=load_org)
- report_graph=places0_graph_reports(start=report_msgraph)
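-
-# Sketch only: the job/schedule modules generated alongside these ops presumably
-# wrap this graph along the following lines; the names and cron string here are
-# hypothetical:
-#
-#   from dagster import ScheduleDefinition
-#   harvest_places0_job = harvest_places0.to_job(name="harvest_places0_job")
-#   harvest_places0_schedule = ScheduleDefinition(
-#       job=harvest_places0_job, cron_schedule="0 6 * * *"
-#   )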
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_princiaq0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_princiaq0.py
deleted file mode 100644
index 0f9a8663..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_princiaq0.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils.util
-import time
-
-from dagster import job, op, graph, In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, and not the names of the object in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET = str(os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH = str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH = str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE = str(os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/'))  # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG = str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG = str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-    if url.endswith(".amazonaws.com"):
-        return "s3.amazonaws.com"
-    return url
-def read_file_bytestream(image_path):
-    with open(image_path, 'rb') as f:
-        return f.read()
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
-    secure = GLEANER_MINIO_USE_SSL
-    if GLEANER_MINIO_PORT == "80" and not secure:
-        server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
-    elif GLEANER_MINIO_PORT == "443" and secure:
-        server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
-    else:
-        # it's not on a default port, so the port must be explicit
-        server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
-    # a future revision of the EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
-    ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
- # docker.py if replicated job, total completions = replicas
- # replicas =0 you do not get a container
-    service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
-        mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
-    wait_count = 0
-    while True:
-        time.sleep(1)
-        wait_count += 1
-        get_dagster_logger().debug(str(service.tasks()))
-
-        container_task = service.tasks(filters={"service": name})
-
-        containers = client.containers.list(all=True, filters={"label": f"com.docker.swarm.service.name={name}"})
-        if len(containers) > 0:
-            break
-        if wait_count > 12:
-            raise Exception(f"Container for service {name} not starting")
-
-    get_dagster_logger().info(str(len(containers)))
-    return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
-    service = None  # so the finally block can tell whether the service was ever created
-    try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
-        cid = container.id # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
-        ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
-        # Future: need to extract the files, and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def princiaq0_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def princiaq0_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "princiaq0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def princiaq0_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "princiaq0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def princiaq0_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "princiaq0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def princiaq0_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "princiaq0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def princiaq0_naburelease(context):
- returned_value = gleanerio(context,("release"), "princiaq0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def princiaq0_uploadrelease(context):
- returned_value = postRelease("princiaq0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def princiaq0_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="princiaq0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "princiaq0"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-    r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def princiaq0_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="princiaq0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "princiaq0"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def princiaq0_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="princiaq0")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "princiaq0"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def princiaq0_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="princiaq0")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "princiaq0"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def princiaq0_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "princiaq0"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
-#Can we simplify and use just a method. Then import these methods?
-# def missingreport_s3(context, msg: str, source="princiaq0"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="princiaq0"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_princiaq0():
- containers = princiaq0_getImage()
- harvest = princiaq0_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = princiaq0_missingreport_s3(start=harvest)
- report_idstat = princiaq0_identifier_stats(start=report_ms3)
- # for some reason, this causes a msg parameter missing
- report_bucketurl = princiaq0_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="princiaq0")
- load_release = princiaq0_naburelease(start=harvest)
- load_uploadrelease = princiaq0_uploadrelease(start=load_release)
-
- load_prune = princiaq0_nabu_prune(start=load_uploadrelease)
- load_prov = princiaq0_nabuprov(start=load_prune)
- load_org = princiaq0_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=princiaq0_missingreport_graph(start=load_org)
- report_graph=princiaq0_graph_reports(start=report_msgraph)
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_pws0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_pws0.py
deleted file mode 100644
index 673d3d59..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_pws0.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils.util
-import time
-
-from dagster import job, op, graph, In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, and not the names of the object in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET = str(os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH = str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH = str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE = str(os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/'))  # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG = str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG = str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
-    with open(image_path, 'rb') as f:
-        return f.read()
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
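-    # Endpoint normalization: drop the port when it is the scheme default
-    # (80 for http, 443 for https); otherwise hand Minio "host:port".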
-    secure = GLEANER_MINIO_USE_SSL
-    if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
-        and not secure):
-        server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
-    elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
-        and secure):
-        server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
-    else:
-        # it's not on a normal port
-        server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
-    # a future revision of EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
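-    # POSTing with a uri= query parameter asks the graph endpoint (a
-    # Blazegraph-style REST API, as assumed here) to fetch and load the
-    # release n-quads file directly from the object store.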
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
-    ## thoughts
-    # return service, container, since there is one
-    restart_policy = RestartPolicy(condition='none')
-    # docker.py: for a replicated job, total completions = replicas
-    # with replicas=0 you do not get a container
-    service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
-    service = client.services.create(
-        image,
-        args=command,
-        env=env_vars,
-        name=name,
-        networks=container_context.networks if len(container_context.networks) else None,
-        restart_policy=restart_policy,
-        mode=service_mode,
-        workdir=workingdir,
-        configs=configs
-    )
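-    # Poll for the swarm task's container to appear (roughly 12s max); a
-    # replicated-job with replicas=1 should yield exactly one container.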
-    wait_count = 0
-    while True:
-        time.sleep(1)
-        wait_count += 1
-        get_dagster_logger().debug(str(service.tasks()))
-
-        container_task = service.tasks(filters={"service": name})
-
-        containers = client.containers.list(all=True, filters={"label": f"com.docker.swarm.service.name={name}"})
-        if len(containers) > 0:
-            break
-        if wait_count > 12:
-            raise Exception(f"Container for service {name} not starting")
-
-    get_dagster_logger().info(str(len(containers)))
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
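-    # mode selects which container to run: "gleaner" harvests a source, while
-    # the nabu modes ("prune", "prov", "orgs", "release") post-process the
-    # harvested objects.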
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
-        cid = container.id  # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
-        # do not let a possible issue with container logs stop log upload.
-        ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
-        # Future: need to extract the files and upload them
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def pws0_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def pws0_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "pws0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def pws0_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "pws0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def pws0_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "pws0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def pws0_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "pws0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def pws0_naburelease(context):
- returned_value = gleanerio(context,("release"), "pws0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def pws0_uploadrelease(context):
- returned_value = postRelease("pws0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def pws0_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="pws0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "pws0"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-    r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def pws0_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="pws0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "pws0"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def pws0_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="pws0")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "pws0"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def pws0_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="pws0")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "pws0"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def pws0_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "pws0"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
-# Can we simplify this to plain methods and then import them?
-# def missingreport_s3(context, msg: str, source="pws0"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="pws0"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_pws0():
- containers = pws0_getImage()
- harvest = pws0_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = pws0_missingreport_s3(start=harvest)
- report_idstat = pws0_identifier_stats(start=report_ms3)
-    # for some reason, this causes a "msg parameter missing" error
- report_bucketurl = pws0_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="pws0")
- load_release = pws0_naburelease(start=harvest)
- load_uploadrelease = pws0_uploadrelease(start=load_release)
-
- load_prune = pws0_nabu_prune(start=load_uploadrelease)
- load_prov = pws0_nabuprov(start=load_prune)
- load_org = pws0_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=pws0_missingreport_graph(start=load_org)
- report_graph=pws0_graph_reports(start=report_msgraph)
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_refgage0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_refgage0.py
deleted file mode 100644
index 99eb59f6..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_refgage0.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils.util  # the util submodule must be imported explicitly for strtobool
-import time
-
-from dagster import job, op, graph, In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in Docker, not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
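-# NOTE: GLEANERIO_MINIO_USE_SSL has no default, and strtobool() fails on None,
-# so this env var must be set for the module to import.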
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET = str(os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH = str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH = str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE = str(os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/'))  # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG = str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG = str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
-    with open(image_path, 'rb') as f:
-        return f.read()
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
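-    # Endpoint normalization: drop the port when it is the scheme default
-    # (80 for http, 443 for https); otherwise hand Minio "host:port".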
-    secure = GLEANER_MINIO_USE_SSL
-    if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
-        and not secure):
-        server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
-    elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
-        and secure):
-        server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
-    else:
-        # it's not on a normal port
-        server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
-    # a future revision of EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
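-    # POSTing with a uri= query parameter asks the graph endpoint (a
-    # Blazegraph-style REST API, as assumed here) to fetch and load the
-    # release n-quads file directly from the object store.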
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
-    ## thoughts
-    # return service, container, since there is one
-    restart_policy = RestartPolicy(condition='none')
-    # docker.py: for a replicated job, total completions = replicas
-    # with replicas=0 you do not get a container
-    service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
-    service = client.services.create(
-        image,
-        args=command,
-        env=env_vars,
-        name=name,
-        networks=container_context.networks if len(container_context.networks) else None,
-        restart_policy=restart_policy,
-        mode=service_mode,
-        workdir=workingdir,
-        configs=configs
-    )
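-    # Poll for the swarm task's container to appear (roughly 12s max); a
-    # replicated-job with replicas=1 should yield exactly one container.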
-    wait_count = 0
-    while True:
-        time.sleep(1)
-        wait_count += 1
-        get_dagster_logger().debug(str(service.tasks()))
-
-        container_task = service.tasks(filters={"service": name})
-
-        containers = client.containers.list(all=True, filters={"label": f"com.docker.swarm.service.name={name}"})
-        if len(containers) > 0:
-            break
-        if wait_count > 12:
-            raise Exception(f"Container for service {name} not starting")
-
-    get_dagster_logger().info(str(len(containers)))
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
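-    # mode selects which container to run: "gleaner" harvests a source, while
-    # the nabu modes ("prune", "prov", "orgs", "release") post-process the
-    # harvested objects.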
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
-        cid = container.id  # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
-        # do not let a possible issue with container logs stop log upload.
-        ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
-        # Future: need to extract the files and upload them
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def refgage0_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def refgage0_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "refgage0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def refgage0_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "refgage0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def refgage0_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "refgage0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def refgage0_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "refgage0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def refgage0_naburelease(context):
- returned_value = gleanerio(context,("release"), "refgage0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def refgage0_uploadrelease(context):
- returned_value = postRelease("refgage0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def refgage0_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="refgage0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "refgage0"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-    r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def refgage0_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="refgage0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "refgage0"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def refgage0_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="refgage0")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "refgage0"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def refgage0_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="refgage0")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "refgage0"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def refgage0_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "refgage0"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
-# Can we simplify this to plain methods and then import them?
-# def missingreport_s3(context, msg: str, source="refgage0"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="refgage0"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_refgage0():
- containers = refgage0_getImage()
- harvest = refgage0_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = refgage0_missingreport_s3(start=harvest)
- report_idstat = refgage0_identifier_stats(start=report_ms3)
-    # for some reason, this causes a "msg parameter missing" error
- report_bucketurl = refgage0_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="refgage0")
- load_release = refgage0_naburelease(start=harvest)
- load_uploadrelease = refgage0_uploadrelease(start=load_release)
-
- load_prune = refgage0_nabu_prune(start=load_uploadrelease)
- load_prov = refgage0_nabuprov(start=load_prune)
- load_org = refgage0_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=refgage0_missingreport_graph(start=load_org)
- report_graph=refgage0_graph_reports(start=report_msgraph)
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_refgage1.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_refgage1.py
deleted file mode 100644
index 00d8d18e..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_refgage1.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils.util  # the util submodule must be imported explicitly for strtobool
-import time
-
-from dagster import job, op, graph, In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in Docker, not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
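-# NOTE: GLEANERIO_MINIO_USE_SSL has no default, and strtobool() fails on None,
-# so this env var must be set for the module to import.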
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET = str(os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH = str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH = str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE = str(os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/'))  # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG = str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG = str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
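-# _pythonMinioUrl normalizes the address for the MinIO Python client, which
-# wants a bare host; any *.amazonaws.com address collapses to the generic
-# endpoint. Sketch of the expected behavior (hypothetical hosts):
-#   _pythonMinioUrl("mybucket.s3.amazonaws.com")  -> "s3.amazonaws.com"
-#   _pythonMinioUrl("minio.example.org")          -> "minio.example.org"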
-def read_file_bytestream(image_path):
-    with open(image_path, 'rb') as f:
-        return f.read()
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
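-# load_data tries the argument as a URL first; urlopen raises ValueError
-# ("unknown url type") for a plain filesystem path, which is the cue to fall
-# back to reading a local file. Example calls (hypothetical paths):
-#   load_data("https://example.org/gleanerconfig.yaml")  # fetched over HTTP
-#   load_data("/gleaner/gleanerconfig.yaml")             # read from disk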
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
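-# s3loader writes each log under GLEANERIO_LOG_PREFIX with a timestamp suffix,
-# so with the default prefix a call like s3loader(data, "sch_refgage1_gleaner")
-# lands at roughly scheduler/logs/sch_refgage1_gleaner_2023_01_01_00_00_00.log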
-def postRelease(source):
-    # revision of EC utilities, will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
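-# postRelease drives the graph store's load-from-URL endpoint; the resulting
-# request is effectively (hypothetical values):
-#   POST {GLEANER_GRAPH_URL}/namespace/{ns}/sparql?uri=.../graphs/latest/refgage1_release.nq
-# Blazegraph fetches the nquads release file itself, so MinIO must be
-# reachable from the graph store, not just from this scheduler container.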
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
-    ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
-    # docker-py: for a replicated job, total completions = replicas;
-    # with replicas=0 you do not get a container
-    service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
-    service = client.services.create(
-        image,
-        args=command,
-        env=env_vars,
-        name=name,
-        networks=container_context.networks if len(container_context.networks) else None,
-        restart_policy=restart_policy,
-        mode=service_mode,
-        workdir=workingdir,
-        configs=configs
-    )
-    wait_count = 0
-    while True:
-        time.sleep(1)
-        wait_count += 1
-        get_dagster_logger().debug(str(service.tasks()))
-
-        container_task = service.tasks(filters={"service": name})
-
-        containers = client.containers.list(all=True, filters={"label": f"com.docker.swarm.service.name={name}"})
-        if len(containers) > 0:
-            break
-        if wait_count > 12:
-            raise Exception(f"Container for service {name} not starting")
-
-    get_dagster_logger().info(len(containers))
- return service, containers[0]
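-# _create_service runs each Gleaner/Nabu invocation as a one-shot swarm
-# "replicated-job" service, then polls for up to ~12s for the job's single
-# task container so logs and the exit code can be read back from it.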
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
-        cid = container.id # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
-        ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
-        # Future: need to extract files and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
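-# gleanerio mode -> container invocation, as wired above:
-#   gleaner : GLEANERIO_GLEANER_IMAGE  --cfg <gleaner cfg> -source <source> --rude
-#   prune   : GLEANERIO_NABU_IMAGE     --cfg <nabu cfg> prune   --prefix summoned/<source>
-#   prov    : GLEANERIO_NABU_IMAGE     --cfg <nabu cfg> prefix  --prefix prov/<source>
-#   orgs    : GLEANERIO_NABU_IMAGE     --cfg <nabu cfg> prefix  --prefix orgs
-#   release : GLEANERIO_NABU_IMAGE     --cfg <nabu cfg> release --prefix summoned/<source>
-# any other mode returns 1 without starting a container.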
-
-@op
-def refgage1_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def refgage1_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "refgage1")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def refgage1_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "refgage1")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def refgage1_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "refgage1")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def refgage1_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "refgage1")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def refgage1_naburelease(context):
- returned_value = gleanerio(context,("release"), "refgage1")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def refgage1_uploadrelease(context):
- returned_value = postRelease("refgage1")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def refgage1_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="refgage1")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "refgage1"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-    r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def refgage1_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="refgage1")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "refgage1"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def refgage1_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="refgage1")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "refgage1"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def refgage1_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="refgage1")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "refgage1"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
-    get_dagster_logger().info(f"identifier stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def refgage1_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "refgage1"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
-    get_dagster_logger().info(f"bucket urls report returned {r} ")
- return
-
-
-#Can we simplify and use just a method. Then import these methods?
-# def missingreport_s3(context, msg: str, source="refgage1"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="refgage1"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_refgage1():
- containers = refgage1_getImage()
- harvest = refgage1_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = refgage1_missingreport_s3(start=harvest)
- report_idstat = refgage1_identifier_stats(start=report_ms3)
-    # for some reason, this causes a "msg parameter missing" error
- report_bucketurl = refgage1_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="refgage1")
- load_release = refgage1_naburelease(start=harvest)
- load_uploadrelease = refgage1_uploadrelease(start=load_release)
-
- load_prune = refgage1_nabu_prune(start=load_uploadrelease)
- load_prov = refgage1_nabuprov(start=load_prune)
- load_org = refgage1_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=refgage1_missingreport_graph(start=load_org)
- report_graph=refgage1_graph_reports(start=report_msgraph)
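-    # The generated jobs module is expected to turn this graph into a runnable
-    # job; a minimal sketch (hypothetical job name):
-    #   implnet_job_refgage1 = harvest_refgage1.to_job(name="implnet_job_refgage1")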
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_refgage2.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_refgage2.py
deleted file mode 100644
index c2c46b6c..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_refgage2.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils.util
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, and not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
-    with open(image_path, 'rb') as f:
-        return f.read()
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
-    # revision of EC utilities, will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
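-# _get_client talks to the Docker Engine API by way of Portainer, so the
-# Portainer API key has to travel as an X-API-Key header on every call;
-# docker-py has no first-class hook for extra headers, hence the
-# _general_configs / client.api.headers poking above.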
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
-    ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
-    # docker-py: for a replicated job, total completions = replicas;
-    # with replicas=0 you do not get a container
-    service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
-    service = client.services.create(
-        image,
-        args=command,
-        env=env_vars,
-        name=name,
-        networks=container_context.networks if len(container_context.networks) else None,
-        restart_policy=restart_policy,
-        mode=service_mode,
-        workdir=workingdir,
-        configs=configs
-    )
-    wait_count = 0
-    while True:
-        time.sleep(1)
-        wait_count += 1
-        get_dagster_logger().debug(str(service.tasks()))
-
-        container_task = service.tasks(filters={"service": name})
-
-        containers = client.containers.list(all=True, filters={"label": f"com.docker.swarm.service.name={name}"})
-        if len(containers) > 0:
-            break
-        if wait_count > 12:
-            raise Exception(f"Container for service {name} not starting")
-
-    get_dagster_logger().info(len(containers))
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
-        cid = container.id # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
-        ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
-        # Future: need to extract files and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def refgage2_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
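-# getImage pre-pulls both images through the Docker endpoint before any
-# service is created, so the later one-shot jobs do not pay for (or time out
-# on) a cold image pull.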
-@op(ins={"start": In(Nothing)})
-def refgage2_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "refgage2")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def refgage2_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "refgage2")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def refgage2_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "refgage2")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def refgage2_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "refgage2")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def refgage2_naburelease(context):
- returned_value = gleanerio(context,("release"), "refgage2")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def refgage2_uploadrelease(context):
- returned_value = postRelease("refgage2")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def refgage2_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="refgage2")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "refgage2"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-    r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def refgage2_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="refgage2")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "refgage2"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def refgage2_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="refgage2")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "refgage2"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def refgage2_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="refgage2")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "refgage2"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
-    get_dagster_logger().info(f"identifier stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def refgage2_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "refgage2"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
-    get_dagster_logger().info(f"bucket urls report returned {r} ")
- return
-
-
-#Can we simplify and use just a method. Then import these methods?
-# def missingreport_s3(context, msg: str, source="refgage2"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="refgage2"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_refgage2():
- containers = refgage2_getImage()
- harvest = refgage2_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = refgage2_missingreport_s3(start=harvest)
- report_idstat = refgage2_identifier_stats(start=report_ms3)
-    # for some reason, this causes a "msg parameter missing" error
- report_bucketurl = refgage2_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="refgage2")
- load_release = refgage2_naburelease(start=harvest)
- load_uploadrelease = refgage2_uploadrelease(start=load_release)
-
- load_prune = refgage2_nabu_prune(start=load_uploadrelease)
- load_prov = refgage2_nabuprov(start=load_prune)
- load_org = refgage2_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=refgage2_missingreport_graph(start=load_org)
- report_graph=refgage2_graph_reports(start=report_msgraph)
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_refgage3.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_refgage3.py
deleted file mode 100644
index e67f7b4d..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_refgage3.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils.util
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, and not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- with open(image_path, 'rb') as f:
- data = f.read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
- # revision of EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
- ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
- # docker.py: for a replicated job, total completions = replicas
- # with replicas=0 you do not get a container
- service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
- mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise f"Container for service {name} not starting"
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add env variables here, e.g. "Env": ["FOO=bar","BAZ=quux"]
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
- cid = container.id  # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
- ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
- # Future: need to extract files and upload them
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def refgage3_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def refgage3_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "refgage3")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def refgage3_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "refgage3")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def refgage3_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "refgage3")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def refgage3_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "refgage3")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def refgage3_naburelease(context):
- returned_value = gleanerio(context,("release"), "refgage3")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def refgage3_uploadrelease(context):
- returned_value = postRelease("refgage3")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def refgage3_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="refgage3")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "refgage3"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def refgage3_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="refgage3")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "refgage3"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def refgage3_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="refgage3")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "refgage3"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def refgage3_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="refgage3")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "refgage3"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def refgage3_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "refgage3"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
-# Can we simplify this into a single method, then import these methods?
-# def missingreport_s3(context, msg: str, source="refgage3"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="refgage3"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_refgage3():
- containers = refgage3_getImage()
- harvest = refgage3_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = refgage3_missingreport_s3(start=harvest)
- report_idstat = refgage3_identifier_stats(start=report_ms3)
- # for some reason, this causes a missing msg parameter error
- report_bucketurl = refgage3_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="refgage3")
- load_release = refgage3_naburelease(start=harvest)
- load_uploadrelease = refgage3_uploadrelease(start=load_release)
-
- load_prune = refgage3_nabu_prune(start=load_uploadrelease)
- load_prov = refgage3_nabuprov(start=load_prune)
- load_org = refgage3_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=refgage3_missingreport_graph(start=load_org)
- report_graph=refgage3_graph_reports(start=report_msgraph)
-
-
-
-
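Each of these deleted modules repeats the same 728-line body with only the source name (`refgage2`, `refgage3`, `rise0`, ...) substituted, which is why they were generated rather than hand-written. One way to collapse that duplication is an op factory that stamps out the per-source, per-mode ops from a single template. A hypothetical sketch; `gleanerio()` is stubbed here but stands for the shared runner defined in each generated module above:

```python
from dagster import In, Nothing, op

def gleanerio(context, mode, source):
    # stub for the runner in the generated modules;
    # the real one raises on a non-zero container exit code
    return 0

def make_mode_op(source: str, mode: str):
    """Build one '<source>_<mode>' op, e.g. 'refgage3_prune'."""
    @op(name=f"{source}_{mode}", ins={"start": In(Nothing)})
    def _mode_op(context):
        returned_value = gleanerio(context, mode, source)
        context.log.info(f"{mode} for {source} returned {returned_value}")
    return _mode_op

# hypothetical usage:
# ops = {m: make_mode_op("refgage3", m)
#        for m in ("gleaner", "prune", "prov", "orgs", "release")}
```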
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_rise0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_rise0.py
deleted file mode 100644
index c6e69dc7..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_rise0.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, and not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- with open(image_path, 'rb') as f:
- data = f.read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
- # revision of EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
- ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
- # docker.py: for a replicated job, total completions = replicas
- # with replicas=0 you do not get a container
- service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
- mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise f"Container for service {name} not starting"
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add env variables here, e.g. "Env": ["FOO=bar","BAZ=quux"]
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
- cid = container.id  # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
- ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
- # Future: need to extract files and upload them
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def rise0_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def rise0_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "rise0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def rise0_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "rise0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def rise0_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "rise0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def rise0_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "rise0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def rise0_naburelease(context):
- returned_value = gleanerio(context,("release"), "rise0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def rise0_uploadrelease(context):
- returned_value = postRelease("rise0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def rise0_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="rise0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "rise0"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def rise0_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="rise0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "rise0"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def rise0_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="rise0")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "rise0"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def rise0_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="rise0")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "rise0"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def rise0_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "rise0"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
-# Can we simplify this into a single method, then import these methods?
-# def missingreport_s3(context, msg: str, source="rise0"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="rise0"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_rise0():
- containers = rise0_getImage()
- harvest = rise0_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = rise0_missingreport_s3(start=harvest)
- report_idstat = rise0_identifier_stats(start=report_ms3)
- # for some reason, this causes a missing msg parameter error
- report_bucketurl = rise0_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="rise0")
- load_release = rise0_naburelease(start=harvest)
- load_uploadrelease = rise0_uploadrelease(start=load_release)
-
- load_prune = rise0_nabu_prune(start=load_uploadrelease)
- load_prov = rise0_nabuprov(start=load_prune)
- load_org = rise0_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=rise0_missingreport_graph(start=load_org)
- report_graph=rise0_graph_reports(start=report_msgraph)
-
-
-
-
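One detail worth preserving from `s3loader()` above: the MinIO client must be given a bare hostname when the port is the scheme default (80 for http, 443 for https) and `host:port` otherwise, after which the log bytes are streamed with `put_object()`. A condensed sketch of that upload path, with placeholder endpoint and credentials:

```python
import io
from datetime import datetime
from minio import Minio

def upload_log(data: bytes, name: str, address="minio.local", port="9000",
               secure=False, bucket="gleaner", prefix="scheduler/logs/"):
    # drop the explicit port only when it matches the scheme default
    default_port = "443" if secure else "80"
    server = address if port == default_port else f"{address}:{port}"
    client = Minio(server, secure=secure,
                   access_key="...", secret_key="...")  # placeholder credentials
    stamp = datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
    obj = f"{prefix}{name}_{stamp}.log"
    client.put_object(bucket, obj, io.BytesIO(data), len(data),
                      content_type="text/plain")
```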
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_sechydrgreg0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_sechydrgreg0.py
deleted file mode 100644
index a1148b8f..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_sechydrgreg0.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, and not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE = str(os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
-    # revision of EC utilities, will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
-    ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
-    # in docker.py, for a replicated job, total completions = replicas;
-    # with replicas=0 you do not get a container
-    service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
-        mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise f"Container for service {name} not starting"
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
-        cid = container.id # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
-        ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
-        # Future: need to extract files and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def sechydrgreg0_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def sechydrgreg0_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "sechydrgreg0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def sechydrgreg0_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "sechydrgreg0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def sechydrgreg0_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "sechydrgreg0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def sechydrgreg0_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "sechydrgreg0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def sechydrgreg0_naburelease(context):
- returned_value = gleanerio(context,("release"), "sechydrgreg0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def sechydrgreg0_uploadrelease(context):
- returned_value = postRelease("sechydrgreg0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def sechydrgreg0_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="sechydrgreg0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "sechydrgreg0"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-    r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def sechydrgreg0_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="sechydrgreg0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "sechydrgreg0"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def sechydrgreg0_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="sechydrgreg0")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "sechydrgreg0"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def sechydrgreg0_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="sechydrgreg0")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "sechydrgreg0"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def sechydrgreg0_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "sechydrgreg0"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
-# Can we simplify and use just a method, then import these methods?
-# def missingreport_s3(context, msg: str, source="sechydrgreg0"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="sechydrgreg0"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_sechydrgreg0():
- containers = sechydrgreg0_getImage()
- harvest = sechydrgreg0_gleaner(start=containers)
-
-# defining nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = sechydrgreg0_missingreport_s3(start=harvest)
- report_idstat = sechydrgreg0_identifier_stats(start=report_ms3)
-    # for some reason, passing a value here causes a "msg parameter missing" error
- report_bucketurl = sechydrgreg0_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="sechydrgreg0")
- load_release = sechydrgreg0_naburelease(start=harvest)
- load_uploadrelease = sechydrgreg0_uploadrelease(start=load_release)
-
- load_prune = sechydrgreg0_nabu_prune(start=load_uploadrelease)
- load_prov = sechydrgreg0_nabuprov(start=load_prune)
- load_org = sechydrgreg0_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=sechydrgreg0_missingreport_graph(start=load_org)
- report_graph=sechydrgreg0_graph_reports(start=report_msgraph)
-
-
-
-
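Apart from the hard-coded source name, the file deleted above is identical to the one deleted below; the generator stamps one copy per source. A hedged sketch of an op factory that would define the same per-source chain at import time instead of generating duplicate modules — make_harvest_graph and its stub ops are hypothetical, not code from this repo:

from dagster import In, Nothing, get_dagster_logger, graph, op

def make_harvest_graph(source: str):
    # each op gets a unique per-source name, mirroring the generated files
    @op(name=f"{source}_gleaner")
    def gleaner_op():
        get_dagster_logger().info(f"would run gleanerio('gleaner', '{source}')")

    @op(name=f"{source}_naburelease", ins={"start": In(Nothing)})
    def release_op():
        get_dagster_logger().info(f"would run gleanerio('release', '{source}')")

    @graph(name=f"harvest_{source}")
    def harvest_graph():
        release_op(start=gleaner_op())

    return harvest_graph

# one graph per source, replacing one generated module per source
harvest_graphs = [make_harvest_graph(s)
                  for s in ("sechydrgreg0", "selfieids0", "states0")]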
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_selfieids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_selfieids0.py
deleted file mode 100644
index ea9c7fc3..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_selfieids0.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, and not the names of the object in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET = str(os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE = str(os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
-    # revision of EC utilities, will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
-    ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
-    # in docker.py, for a replicated job, total completions = replicas;
-    # with replicas=0 you do not get a container
-    service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
-        mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise f"Container for service {name} not starting"
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
-        cid = container.id # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
-        ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
-        # Future: need to extract files and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def selfieids0_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def selfieids0_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "selfieids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def selfieids0_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "selfieids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def selfieids0_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "selfieids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def selfieids0_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "selfieids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def selfieids0_naburelease(context):
- returned_value = gleanerio(context,("release"), "selfieids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def selfieids0_uploadrelease(context):
- returned_value = postRelease("selfieids0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def selfieids0_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="selfieids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "selfieids0"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-    r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def selfieids0_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="selfieids0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "selfieids0"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def selfieids0_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="selfieids0")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "selfieids0"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def selfieids0_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="selfieids0")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "selfieids0"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def selfieids0_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "selfieids0"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
-# Can we simplify and use just a method, then import these methods?
-# def missingreport_s3(context, msg: str, source="selfieids0"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="selfieids0"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_selfieids0():
- containers = selfieids0_getImage()
- harvest = selfieids0_gleaner(start=containers)
-
-# defining nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = selfieids0_missingreport_s3(start=harvest)
- report_idstat = selfieids0_identifier_stats(start=report_ms3)
-    # for some reason, passing a value here causes a "msg parameter missing" error
- report_bucketurl = selfieids0_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="selfieids0")
- load_release = selfieids0_naburelease(start=harvest)
- load_uploadrelease = selfieids0_uploadrelease(start=load_release)
-
- load_prune = selfieids0_nabu_prune(start=load_uploadrelease)
- load_prov = selfieids0_nabuprov(start=load_prune)
- load_org = selfieids0_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=selfieids0_missingreport_graph(start=load_org)
- report_graph=selfieids0_graph_reports(start=report_msgraph)
-
-
-
-
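The gleanerio helper repeated in each deleted file selects the image, working directory, and command with a five-branch if/elif chain. The same dispatch can be read as a lookup table; in this sketch the constants stand in for the module-level GLEANERIO_* values and resolve is a hypothetical helper:

from typing import List, Optional, Tuple

GLEANER_IMAGE = "nsfearthcube/gleaner:latest"  # stand-in for GLEANERIO_GLEANER_IMAGE
NABU_IMAGE = "nsfearthcube/nabu:latest"        # stand-in for GLEANERIO_NABU_IMAGE
GLEANER_CFG = "/gleaner/gleanerconfig.yaml"
NABU_CFG = "/nabu/nabuconfig.yaml"

# mode -> (image, workdir, argument template); {source} is filled in per run
MODES = {
    "gleaner": (GLEANER_IMAGE, "/gleaner/",
                ["--cfg", GLEANER_CFG, "-source", "{source}", "--rude"]),
    "prune":   (NABU_IMAGE, "/nabu/",
                ["--cfg", NABU_CFG, "prune", "--prefix", "summoned/{source}"]),
    "prov":    (NABU_IMAGE, "/nabu/",
                ["--cfg", NABU_CFG, "prefix", "--prefix", "prov/{source}"]),
    "orgs":    (NABU_IMAGE, "/nabu/",
                ["--cfg", NABU_CFG, "prefix", "--prefix", "orgs"]),
    "release": (NABU_IMAGE, "/nabu/",
                ["--cfg", NABU_CFG, "release", "--prefix", "summoned/{source}"]),
}

def resolve(mode: str, source: str) -> Optional[Tuple[str, str, List[str]]]:
    spec = MODES.get(mode)
    if spec is None:
        return None  # callers treat this as the non-zero returnCode branch
    image, workdir, args = spec
    return image, workdir, [a.format(source=source) for a in args]

# resolve("prune", "states0") ->
#   ("nsfearthcube/nabu:latest", "/nabu/",
#    ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/states0"])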
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_states0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_states0.py
deleted file mode 100644
index aeff1252..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_states0.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, and not the names of the object in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
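-# Normalize the configured address for the minio python client: any
-# *.amazonaws.com host collapses to s3.amazonaws.com; others pass through.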
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- with open(image_path, 'rb') as f:
- return f.read()
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and not secure):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
- # revision of EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
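- # build the URL of the latest release nquads in the bucket and ask the graph
- # store (a Blazegraph-style namespace endpoint) to bulk-load it via ?uri=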
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
- ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
- # docker.py if replicated job, total completions = replicas
- # replicas =0 you do not get a container
- service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
- mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
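- # Poll for the swarm job's container: a replicated-job task can take a few
- # seconds to schedule, so retry once per second and give up after 12 tries.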
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise Exception(f"Container for service {name} not starting")
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
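- # mode dispatch: "gleaner" harvests a source with the gleaner image; the
- # remaining modes ("release", "prune", "prov", "orgs") run the nabu image
- # against the summoned objects and graph store.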
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add extra env variables here, e.g. "Env": ["FOO=bar", "BAZ=quux"]
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
- cid = container.id # legacy until the container start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
- ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
- # Future: need to extract the files and upload them
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def states0_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def states0_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "states0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def states0_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "states0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def states0_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "states0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def states0_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "states0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def states0_naburelease(context):
- returned_value = gleanerio(context,("release"), "states0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def states0_uploadrelease(context):
- returned_value = postRelease("states0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def states0_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="states0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "states0"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def states0_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="states0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "states0"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def states0_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="states0")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "states0"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def states0_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="states0")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "states0"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifier stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def states0_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "states0"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucket urls report returned {r} ")
- return
-
-
-#Can we simplify and use just a method. Then import these methods?
-# def missingreport_s3(context, msg: str, source="states0"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="states0"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_states0():
- containers = states0_getImage()
- harvest = states0_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
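-# A sketch of the intended DAG shape (inferred from the wiring below):
-# chain 1: s3 reports (missing report -> identifier stats -> bucket urls)
-# chain 2: nabu load (release -> upload -> prune -> prov -> orgs), then graph reports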
- report_ms3 = states0_missingreport_s3(start=harvest)
- report_idstat = states0_identifier_stats(start=report_ms3)
- # for some reason, this causes a missing msg parameter error
- report_bucketurl = states0_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="states0")
- load_release = states0_naburelease(start=harvest)
- load_uploadrelease = states0_uploadrelease(start=load_release)
-
- load_prune = states0_nabu_prune(start=load_uploadrelease)
- load_prov = states0_nabuprov(start=load_prune)
- load_org = states0_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=states0_missingreport_graph(start=load_org)
- report_graph=states0_graph_reports(start=report_msgraph)
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_ua100.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_ua100.py
deleted file mode 100644
index 160d7c96..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_ua100.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils.util
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, and not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL', 'false')))  # default avoids a TypeError when the env var is unset
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
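-# Normalize the configured address for the minio python client: any
-# *.amazonaws.com host collapses to s3.amazonaws.com; others pass through.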
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- with open(image_path, 'rb') as f:
- return f.read()
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and not secure):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
- # revision of EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
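- # build the URL of the latest release nquads in the bucket and ask the graph
- # store (a Blazegraph-style namespace endpoint) to bulk-load it via ?uri=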
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
- ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
- # docker.py if replicated job, total completions = replicas
- # replicas =0 you do not get a container
- service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
- mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
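- # Poll for the swarm job's container: a replicated-job task can take a few
- # seconds to schedule, so retry once per second and give up after 12 tries.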
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise Exception(f"Container for service {name} not starting")
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
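- # mode dispatch: "gleaner" harvests a source with the gleaner image; the
- # remaining modes ("release", "prune", "prov", "orgs") run the nabu image
- # against the summoned objects and graph store.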
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add extra env variables here, e.g. "Env": ["FOO=bar", "BAZ=quux"]
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
- cid = container.id # legacy until the container start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
- ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
- # Future: need to extract the files and upload them
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def ua100_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def ua100_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "ua100")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def ua100_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "ua100")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def ua100_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "ua100")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def ua100_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "ua100")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def ua100_naburelease(context):
- returned_value = gleanerio(context,("release"), "ua100")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def ua100_uploadrelease(context):
- returned_value = postRelease("ua100")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def ua100_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="ua100")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "ua100"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def ua100_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="ua100")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "ua100"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def ua100_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="ua100")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "ua100"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def ua100_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="ua100")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "ua100"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifier stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def ua100_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "ua100"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucket urls report returned {r} ")
- return
-
-
-#Can we simplify and use just a method. Then import these methods?
-# def missingreport_s3(context, msg: str, source="ua100"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="ua100"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_ua100():
- containers = ua100_getImage()
- harvest = ua100_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
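-# A sketch of the intended DAG shape (inferred from the wiring below):
-# chain 1: s3 reports (missing report -> identifier stats -> bucket urls)
-# chain 2: nabu load (release -> upload -> prune -> prov -> orgs), then graph reports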
- report_ms3 = ua100_missingreport_s3(start=harvest)
- report_idstat = ua100_identifier_stats(start=report_ms3)
- # for some reason, this causes a missing msg parameter error
- report_bucketurl = ua100_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="ua100")
- load_release = ua100_naburelease(start=harvest)
- load_uploadrelease = ua100_uploadrelease(start=load_release)
-
- load_prune = ua100_nabu_prune(start=load_uploadrelease)
- load_prov = ua100_nabuprov(start=load_prune)
- load_org = ua100_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=ua100_missingreport_graph(start=load_org)
- report_graph=ua100_graph_reports(start=report_msgraph)
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_wade0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_wade0.py
deleted file mode 100644
index 3015cf37..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_wade0.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils.util
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, and not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
-    # revision of EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
-    ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
-    # docker.py: for a replicated job, total completions = replicas
-    # with replicas=0 you do not get a container
-    service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
-        mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
-            raise Exception(f"Container for service {name} not starting")
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
-        # add in env variables here, e.g. "Env": ["FOO=bar", "BAZ=quux"]
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
-        cid = container.id  # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
-        ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
-        # we pull the logs first, then throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
-        # Future: need to extract files and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def wade0_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def wade0_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "wade0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def wade0_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "wade0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def wade0_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "wade0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def wade0_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "wade0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def wade0_naburelease(context):
- returned_value = gleanerio(context,("release"), "wade0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def wade0_uploadrelease(context):
- returned_value = postRelease("wade0")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def wade0_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "wade0"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-    r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def wade0_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade0")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "wade0"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def wade0_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade0")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "wade0"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def wade0_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade0")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "wade0"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
-    get_dagster_logger().info(f"identifier stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def wade0_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "wade0"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
-    get_dagster_logger().info(f"bucket urls report returned {r} ")
- return
-
-
-# Can we simplify and use just a method, then import these methods?
-# def missingreport_s3(context, msg: str, source="wade0"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="wade0"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_wade0():
- containers = wade0_getImage()
- harvest = wade0_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = wade0_missingreport_s3(start=harvest)
- report_idstat = wade0_identifier_stats(start=report_ms3)
-    # for some reason, this causes a missing msg parameter error
- report_bucketurl = wade0_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="wade0")
- load_release = wade0_naburelease(start=harvest)
- load_uploadrelease = wade0_uploadrelease(start=load_release)
-
- load_prune = wade0_nabu_prune(start=load_uploadrelease)
- load_prov = wade0_nabuprov(start=load_prune)
- load_org = wade0_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=wade0_missingreport_graph(start=load_org)
- report_graph=wade0_graph_reports(start=report_msgraph)
-
-
-
-
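Note: each generated module reads GLEANERIO_MINIO_USE_SSL through distutils.util.strtobool, which fails when the variable is unset and depends on distutils, removed in Python 3.12. A possible replacement, assuming a hypothetical env_bool helper that is not part of this codebase:

import os

def env_bool(name: str, default: bool = False) -> bool:
    # accepts the same truthy strings strtobool did, but tolerates unset vars
    val = os.environ.get(name)
    if val is None:
        return default
    return val.strip().lower() in ("y", "yes", "t", "true", "on", "1")

GLEANER_MINIO_USE_SSL = env_bool("GLEANERIO_MINIO_USE_SSL")
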
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_wade1.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_wade1.py
deleted file mode 100644
index 52979dd5..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_wade1.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils.util
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, and not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
-    # revision of EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
-    ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
-    # docker.py: for a replicated job, total completions = replicas
-    # with replicas=0 you do not get a container
-    service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
-        mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
-            raise Exception(f"Container for service {name} not starting")
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
-        # add in env variables here, e.g. "Env": ["FOO=bar", "BAZ=quux"]
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
-        cid = container.id  # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
-        ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
-        # we pull the logs first, then throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
-        # Future: need to extract files and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def wade1_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def wade1_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "wade1")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def wade1_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "wade1")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def wade1_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "wade1")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def wade1_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "wade1")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def wade1_naburelease(context):
- returned_value = gleanerio(context,("release"), "wade1")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def wade1_uploadrelease(context):
- returned_value = postRelease("wade1")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def wade1_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade1")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "wade1"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-    r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def wade1_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade1")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "wade1"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def wade1_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade1")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "wade1"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def wade1_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade1")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "wade1"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
-    get_dagster_logger().info(f"identifier stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def wade1_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "wade1"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
-    get_dagster_logger().info(f"bucket urls report returned {r} ")
- return
-
-
-# Can we simplify and use just a method, then import these methods?
-# def missingreport_s3(context, msg: str, source="wade1"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="wade1"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_wade1():
- containers = wade1_getImage()
- harvest = wade1_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = wade1_missingreport_s3(start=harvest)
- report_idstat = wade1_identifier_stats(start=report_ms3)
-    # for some reason, this causes a missing msg parameter error
- report_bucketurl = wade1_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="wade1")
- load_release = wade1_naburelease(start=harvest)
- load_uploadrelease = wade1_uploadrelease(start=load_release)
-
- load_prune = wade1_nabu_prune(start=load_uploadrelease)
- load_prov = wade1_nabuprov(start=load_prune)
- load_org = wade1_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=wade1_missingreport_graph(start=load_org)
- report_graph=wade1_graph_reports(start=report_msgraph)
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_wade10.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_wade10.py
deleted file mode 100644
index eecc625a..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_wade10.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils.util
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, and not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET = str(os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE = str(os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
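-# Example minimal environment for this module (illustrative values only;
-# real deployments should supply credentials via the deployment env file
-# or Docker secrets):
-#   GLEANERIO_MINIO_ADDRESS=minio
-#   GLEANERIO_MINIO_PORT=9000
-#   GLEANERIO_MINIO_USE_SSL=false
-#   GLEANERIO_MINIO_BUCKET=gleaner
-#   GLEANERIO_GRAPH_URL=http://graph:9999/blazegraph
-#   GLEANERIO_GRAPH_NAMESPACE=iow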
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
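-# e.g. _pythonMinioUrl("mybucket.s3.amazonaws.com") returns "s3.amazonaws.com";
-# any non-AWS address (e.g. a local "minio" host) is returned unchanged.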
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
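-# Usage sketch (hypothetical call): s3loader(b"run output", "sch_wade10_gleaner")
-# uploads scheduler/logs/sch_wade10_gleaner_<timestamp>.log to the bucket.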
-def postRelease(source):
-    # a revision of EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
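-# The POST above relies on the Blazegraph REST API loading data from a URI;
-# an equivalent manual call (hypothetical host names) would be roughly:
-#   curl -X POST "http://graph:9999/blazegraph/namespace/iow/sparql?uri=http://minio:9000/gleaner/graphs/latest/wade10_release.nq"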
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
-    ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
-    # per docker-py: for a replicated job, total completions = replicas;
-    # with replicas=0 you do not get a container
-    service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
-        mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
-            raise Exception(f"Container for service {name} not starting")
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
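-    # mode -> container dispatched below:
-    #   gleaner : GLEANERIO_GLEANER_IMAGE, harvest the source
-    #   prune   : GLEANERIO_NABU_IMAGE, prune graph against summoned/<source>
-    #   prov    : GLEANERIO_NABU_IMAGE, load the prov/<source> prefix
-    #   orgs    : GLEANERIO_NABU_IMAGE, load the orgs prefix
-    #   release : GLEANERIO_NABU_IMAGE, write a release for summoned/<source>
-    # any other mode returns 1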
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
-        cid = container.id # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
-        ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
-        # Future: need to extract files and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def wade10_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def wade10_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "wade10")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def wade10_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "wade10")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def wade10_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "wade10")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def wade10_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "wade10")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def wade10_naburelease(context):
- returned_value = gleanerio(context,("release"), "wade10")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def wade10_uploadrelease(context):
- returned_value = postRelease("wade10")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def wade10_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade10")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "wade10"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-    r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def wade10_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade10")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "wade10"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def wade10_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade10")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "wade10"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def wade10_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade10")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "wade10"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
-    get_dagster_logger().info(f"identifier stats report returned {r}")
- return
-
-@op(ins={"start": In(Nothing)})
-def wade10_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "wade10"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
-    get_dagster_logger().info(f"bucket urls report returned {r}")
- return
-
-
-# Can we simplify this to a single method and then import these methods?
-# def missingreport_s3(context, msg: str, source="wade10"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="wade10"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
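-# One possible simplification (a sketch only, not generated code): build the
-# per-source ops with a factory instead of repeating the body in every module:
-#
-#   def make_missingreport_s3_op(source_name: str):
-#       @op(name=f"{source_name}_missingreport_s3", ins={"start": In(Nothing)})
-#       def _missingreport_s3(context):
-#           source = getSitemapSourcesFromGleaner(
-#               GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source_name)
-#           ...  # same body as wade10_missingreport_s3 above
-#       return _missingreport_s3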
-@graph
-def harvest_wade10():
- containers = wade10_getImage()
- harvest = wade10_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = wade10_missingreport_s3(start=harvest)
- report_idstat = wade10_identifier_stats(start=report_ms3)
-    # for some reason, this causes a "msg parameter missing" error
- report_bucketurl = wade10_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="wade10")
- load_release = wade10_naburelease(start=harvest)
- load_uploadrelease = wade10_uploadrelease(start=load_release)
-
- load_prune = wade10_nabu_prune(start=load_uploadrelease)
- load_prov = wade10_nabuprov(start=load_prune)
- load_org = wade10_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=wade10_missingreport_graph(start=load_org)
- report_graph=wade10_graph_reports(start=report_msgraph)
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_wade11.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_wade11.py
deleted file mode 100644
index 915ab299..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_wade11.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in Docker, not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET = str(os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE = str(os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
-    # a revision of EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
-    ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
-    # per docker-py: for a replicated job, total completions = replicas;
-    # with replicas=0 you do not get a container
-    service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
-        mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
-            raise Exception(f"Container for service {name} not starting")
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
-        cid = container.id # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
-        ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
-        # Future: need to extract files and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def wade11_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def wade11_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "wade11")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def wade11_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "wade11")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def wade11_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "wade11")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def wade11_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "wade11")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def wade11_naburelease(context):
- returned_value = gleanerio(context,("release"), "wade11")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def wade11_uploadrelease(context):
- returned_value = postRelease("wade11")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def wade11_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade11")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "wade11"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-    r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def wade11_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade11")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "wade11"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def wade11_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade11")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "wade11"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def wade11_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade11")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "wade11"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
-    get_dagster_logger().info(f"identifier stats report returned {r}")
- return
-
-@op(ins={"start": In(Nothing)})
-def wade11_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "wade11"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
-    get_dagster_logger().info(f"bucket urls report returned {r}")
- return
-
-
-# Can we simplify this to a single method and then import these methods?
-# def missingreport_s3(context, msg: str, source="wade11"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="wade11"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_wade11():
- containers = wade11_getImage()
- harvest = wade11_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = wade11_missingreport_s3(start=harvest)
- report_idstat = wade11_identifier_stats(start=report_ms3)
-    # for some reason, this causes a "msg parameter missing" error
- report_bucketurl = wade11_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="wade11")
- load_release = wade11_naburelease(start=harvest)
- load_uploadrelease = wade11_uploadrelease(start=load_release)
-
- load_prune = wade11_nabu_prune(start=load_uploadrelease)
- load_prov = wade11_nabuprov(start=load_prune)
- load_org = wade11_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=wade11_missingreport_graph(start=load_org)
- report_graph=wade11_graph_reports(start=report_msgraph)
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_wade12.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_wade12.py
deleted file mode 100644
index d267c955..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_wade12.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in Docker, not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
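-# NOTE: distutils is deprecated (removed in Python 3.12), so strtobool above
-# will eventually need a local replacement. A minimal stand-in (hypothetical
-# helper, not part of this module) could look like:
-#
-# def _strtobool(value: str) -> bool:
-#     v = value.strip().lower()
-#     if v in ("y", "yes", "t", "true", "on", "1"):
-#         return True
-#     if v in ("n", "no", "f", "false", "off", "0"):
-#         return False
-#     raise ValueError(f"invalid truth value {value!r}")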
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
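-# e.g. _pythonMinioUrl("s3.us-west-2.amazonaws.com") -> "s3.amazonaws.com",
-# while any non-AWS host (e.g. "minio.local") is passed through unchanged.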
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
- # revision of EC utilities, will have a insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
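-# The release load is a plain HTTP POST of the public object URL to the graph
-# namespace endpoint; with illustrative values only, it amounts to:
-#
-#   POST http://graph:9999/blazegraph/namespace/iow/sparql?uri=http://minio:9000/gleaner/graphs/latest/wade12_release.nq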
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
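-# Usage sketch (assumes PORTAINER_URL points at a Portainer-proxied Docker API
-# and PORTAINER_KEY is a valid API key; a default DockerContainerContext skips
-# the registry login branch):
-#
-# client = _get_client(DockerContainerContext())
-# client.ping()  # verifies the authenticated connection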
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
- ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
- # docker.py if replicated job, total completions = replicas
- # replicas =0 you do not get a container
- service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
- mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise f"Container for service {name} not starting"
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
-
-
-
-
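-# Note: the polling loop above gives the swarm roughly 12 seconds (12 iterations
-# of a 1s sleep) to schedule a container for the replicated-job before raising.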
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
- cid = container.id # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
- ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
- # Future: need to extract files and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
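-# Mode -> container dispatch summary (from the branches above):
-#   gleaner : GLEANERIO_GLEANER_IMAGE, --cfg <gleaner cfg> -source <source> --rude
-#   prune   : GLEANERIO_NABU_IMAGE,    --cfg <nabu cfg> prune   --prefix summoned/<source>
-#   prov    : GLEANERIO_NABU_IMAGE,    --cfg <nabu cfg> prefix  --prefix prov/<source>
-#   orgs    : GLEANERIO_NABU_IMAGE,    --cfg <nabu cfg> prefix  --prefix orgs
-#   release : GLEANERIO_NABU_IMAGE,    --cfg <nabu cfg> release --prefix summoned/<source>
-# Any other mode returns 1 without launching a container.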
-@op
-def wade12_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def wade12_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "wade12")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def wade12_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "wade12")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def wade12_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "wade12")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def wade12_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "wade12")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def wade12_naburelease(context):
- returned_value = gleanerio(context,("release"), "wade12")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def wade12_uploadrelease(context):
- returned_value = postRelease("wade12")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def wade12_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade12")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "wade12"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def wade12_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade12")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "wade12"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def wade12_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade12")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "wade12"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def wade12_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade12")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "wade12"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def wade12_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "wade12"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
-# Can we simplify and use just a method, then import these methods?
-# def missingreport_s3(context, msg: str, source="wade12"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="wade12"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_wade12():
- containers = wade12_getImage()
- harvest = wade12_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = wade12_missingreport_s3(start=harvest)
- report_idstat = wade12_identifier_stats(start=report_ms3)
- # for some reason, this causes a msg parameter missing
- report_bucketurl = wade12_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="wade12")
- load_release = wade12_naburelease(start=harvest)
- load_uploadrelease = wade12_uploadrelease(start=load_release)
-
- load_prune = wade12_nabu_prune(start=load_uploadrelease)
- load_prov = wade12_nabuprov(start=load_prune)
- load_org = wade12_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=wade12_missingreport_graph(start=load_org)
- report_graph=wade12_graph_reports(start=report_msgraph)
-
-
-
-
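-# A minimal sketch of how a graph like this can be wired into a schedulable
-# job (names and cron string illustrative only, not part of this file):
-#
-# from dagster import ScheduleDefinition
-#
-# harvest_wade12_job = harvest_wade12.to_job(name="harvest_wade12_job")
-# harvest_wade12_schedule = ScheduleDefinition(
-#     job=harvest_wade12_job,
-#     cron_schedule="0 6 * * *",
-# )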
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_wade13.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_wade13.py
deleted file mode 100644
index 1b0701bd..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_wade13.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
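-# load_data falls back from URL to local file: urlopen raises ValueError for a
-# non-URL argument (e.g. "/gleaner/gleanerconfig.yaml"), which routes the call
-# to the plain open() branch.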
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
- # revision of EC utilities, will have a insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
- ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
- # docker.py if replicated job, total completions = replicas
- # replicas =0 you do not get a container
- service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
- mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise f"Container for service {name} not starting"
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
- cid = container.id # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
- ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
- # Future: need to extract files and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def wade13_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
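-# getImage pre-pulls both images through the Portainer-backed client so the
-# target engine has them cached before any service in this graph is created.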
-@op(ins={"start": In(Nothing)})
-def wade13_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "wade13")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def wade13_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "wade13")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def wade13_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "wade13")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def wade13_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "wade13")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def wade13_naburelease(context):
- returned_value = gleanerio(context,("release"), "wade13")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def wade13_uploadrelease(context):
- returned_value = postRelease("wade13")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def wade13_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade13")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "wade13"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def wade13_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade13")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "wade13"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def wade13_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade13")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "wade13"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def wade13_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade13")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "wade13"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def wade13_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "wade13"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
-# Can we simplify and use just a method, then import these methods?
-# def missingreport_s3(context, msg: str, source="wade13"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="wade13"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_wade13():
- containers = wade13_getImage()
- harvest = wade13_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = wade13_missingreport_s3(start=harvest)
- report_idstat = wade13_identifier_stats(start=report_ms3)
- # for some reason, this causes a msg parameter missing
- report_bucketurl = wade13_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="wade13")
- load_release = wade13_naburelease(start=harvest)
- load_uploadrelease = wade13_uploadrelease(start=load_release)
-
- load_prune = wade13_nabu_prune(start=load_uploadrelease)
- load_prov = wade13_nabuprov(start=load_prune)
- load_org = wade13_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=wade13_missingreport_graph(start=load_org)
- report_graph=wade13_graph_reports(start=report_msgraph)
-
-
-
-
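-# For local testing, a graph like this can be materialized directly (sketch;
-# assumes the GLEANERIO_*/PORTAINER_* env vars are set and Docker is reachable):
-#
-# if __name__ == "__main__":
-#     result = harvest_wade13.execute_in_process()
-#     assert result.success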
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_wade14.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_wade14.py
deleted file mode 100644
index 65d7d3ad..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_wade14.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
- # revision of EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
- ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
- # docker.py if replicated job, total completions = replicas
- # replicas =0 you do not get a container
- service_mode = ServiceMode("replicated-job",concurrency=1,replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
- mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise f"Container for service {name} not starting"
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
- cid = container.id # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
- ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
- # Future, need to extract files, and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def wade14_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def wade14_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "wade14")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def wade14_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "wade14")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def wade14_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "wade14")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def wade14_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "wade14")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def wade14_naburelease(context):
- returned_value = gleanerio(context,("release"), "wade14")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def wade14_uploadrelease(context):
- returned_value = postRelease("wade14")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def wade14_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade14")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "wade14"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def wade14_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade14")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "wade14"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def wade14_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade14")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "wade14"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def wade14_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade14")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "wade14"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def wade14_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "wade14"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
-#Can we simplify and use just a method. Then import these methods?
-# def missingreport_s3(context, msg: str, source="wade14"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="wade14"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_wade14():
- containers = wade14_getImage()
- harvest = wade14_gleaner(start=containers)
-
-# defining nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = wade14_missingreport_s3(start=harvest)
- report_idstat = wade14_identifier_stats(start=report_ms3)
- # for some reason, this causes a missing msg parameter error
- report_bucketurl = wade14_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="wade14")
- load_release = wade14_naburelease(start=harvest)
- load_uploadrelease = wade14_uploadrelease(start=load_release)
-
- load_prune = wade14_nabu_prune(start=load_uploadrelease)
- load_prov = wade14_nabuprov(start=load_prune)
- load_org = wade14_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=wade14_missingreport_graph(start=load_org)
- report_graph=wade14_graph_reports(start=report_msgraph)
-
-
-
-
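Each of these generated modules repeats the same ~700 lines with only the source name changed, and the commented-out block above asks whether a shared method could replace them. One possible direction, sketched with hypothetical names (not code from this repo), is an op factory that closes over the source:

```python
from dagster import In, Nothing, graph, op

def make_source_ops(source: str):
    # the real gleanerio() calls are stubbed; only the wiring is shown
    @op(name=f"{source}_gleaner")
    def gleaner_op(context):
        context.log.info(f"would call gleanerio(context, 'gleaner', '{source}')")

    @op(name=f"{source}_release", ins={"start": In(Nothing)})
    def release_op(context):
        context.log.info(f"would call gleanerio(context, 'release', '{source}')")

    return gleaner_op, release_op

def make_harvest_graph(source: str):
    gleaner_op, release_op = make_source_ops(source)

    @graph(name=f"harvest_{source}")
    def _harvest():
        release_op(start=gleaner_op())

    return _harvest

# one graph per source, with no per-source generated module
harvest_graphs = [make_harvest_graph(s) for s in ("wade14", "wade15", "wade16")]
```

The explicit `name=` arguments are what keep the per-source definitions from colliding when Dagster registers them.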
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_wade15.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_wade15.py
deleted file mode 100644
index 97cee9b6..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_wade15.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils.util  # strtobool needs the util submodule imported explicitly
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, and not the names of the object in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
- # revision of EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
- ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
- # docker.py if replicated job, total completions = replicas
- # replicas =0 you do not get a container
- service_mode = ServiceMode("replicated-job",concurrency=1,replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
- mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise f"Container for service {name} not starting"
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
- cid = container.id # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
- ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
- # Future, need to extract files, and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def wade15_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def wade15_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "wade15")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def wade15_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "wade15")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def wade15_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "wade15")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def wade15_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "wade15")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def wade15_naburelease(context):
- returned_value = gleanerio(context,("release"), "wade15")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def wade15_uploadrelease(context):
- returned_value = postRelease("wade15")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def wade15_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade15")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "wade15"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def wade15_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade15")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "wade15"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def wade15_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade15")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "wade15"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def wade15_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade15")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "wade15"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def wade15_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "wade15"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
-#Can we simplify and use just a method. Then import these methods?
-# def missingreport_s3(context, msg: str, source="wade15"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="wade15"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_wade15():
- containers = wade15_getImage()
- harvest = wade15_gleaner(start=containers)
-
-# defining nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = wade15_missingreport_s3(start=harvest)
- report_idstat = wade15_identifier_stats(start=report_ms3)
- # for some reason, this causes a missing msg parameter error
- report_bucketurl = wade15_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="wade15")
- load_release = wade15_naburelease(start=harvest)
- load_uploadrelease = wade15_uploadrelease(start=load_release)
-
- load_prune = wade15_nabu_prune(start=load_uploadrelease)
- load_prov = wade15_nabuprov(start=load_prune)
- load_org = wade15_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=wade15_missingreport_graph(start=load_org)
- report_graph=wade15_graph_reports(start=report_msgraph)
-
-
-
-
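The `_create_service` helper in these modules runs each Gleaner/Nabu invocation as a one-shot Docker Swarm service: `replicated-job` mode with a single replica, restarts disabled, then a polling loop until the job's container exists so its logs and exit code can be read. A stripped-down sketch of that lifecycle, assuming a reachable swarm manager and using placeholder image and service names:

```python
import time

import docker
from docker.types import RestartPolicy, ServiceMode

client = docker.from_env()  # the modules above authenticate against Portainer instead

service = client.services.create(
    "alpine:3",                                      # placeholder image
    args=["echo", "hello"],
    name="sch_example_job",
    restart_policy=RestartPolicy(condition="none"),  # one-shot: never restart
    mode=ServiceMode("replicated-job", replicas=1, concurrency=1),
)

# a replicated-job schedules its container asynchronously, so poll briefly,
# mirroring the wait_count > 12 guard above
for _ in range(12):
    containers = client.containers.list(
        all=True,
        filters={"label": "com.docker.swarm.service.name=sch_example_job"},
    )
    if containers:
        break
    time.sleep(1)
else:
    raise RuntimeError("container for service sch_example_job never started")

exit_status = containers[0].wait()["StatusCode"]  # block until the job finishes
print(f"exit status: {exit_status}")
service.remove()  # the modules above skip removal when DEBUG is set
```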
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_wade16.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_wade16.py
deleted file mode 100644
index 6951b8ab..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_wade16.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils.util  # strtobool needs the util submodule imported explicitly
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in Docker, not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
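-    # Normalize the address for the MinIO python client: AWS endpoints
-    # collapse to "s3.amazonaws.com"; anything else passes through unchanged.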
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
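-    # Fetch bytes from a URL; urlopen raises ValueError for non-URL input,
-    # in which case the value is treated as a local file path instead.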
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
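-    # Read an object from the MinIO bucket, logging connection details;
-    # returns the response stream, or None when the read fails.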
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
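-    # Upload log bytes to MinIO as a timestamped object under GLEANERIO_LOG_PREFIX.
-    # The port is omitted from the server string when it matches the scheme
-    # default (80 plain / 443 TLS).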
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
-    # revision of EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
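-    # POST the source's latest release graph (an N-Quads object under
-    # graphs/latest in the MinIO bucket) to the SPARQL endpoint via its
-    # ?uri= load parameter, and fail loudly when nothing was inserted.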
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
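-    # Build a docker client against the Portainer-proxied Docker API.
-    # docker-py has no first-class option for extra auth headers, so the
-    # X-API-Key header is pushed into the client's config and session directly.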
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
-    ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
- # docker.py if replicated job, total completions = replicas
- # replicas =0 you do not get a container
-    service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
-        mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
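-    # A replicated-job service creates its container asynchronously; poll up
-    # to ~12s for a container carrying this service's swarm label.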
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
-            raise Exception(f"Container for service {name} not starting")
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
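-    # Run a single Gleaner or Nabu container as a swarm replicated-job service.
-    # mode selects the command: "gleaner" (harvest) or the nabu subcommands
-    # "release", "prune", "prov", "orgs". Container logs are copied to S3 and a
-    # non-zero container exit code is re-raised as an op failure.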
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
-        cid = container.id  # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
-        ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
-        # Future: need to extract files and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def wade16_getImage(context):
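-    # Pre-pull the Gleaner and Nabu images via the shared docker client so
-    # later service creation does not pay the pull cost.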
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def wade16_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "wade16")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def wade16_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "wade16")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def wade16_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "wade16")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def wade16_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "wade16")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def wade16_naburelease(context):
- returned_value = gleanerio(context,("release"), "wade16")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def wade16_uploadrelease(context):
- returned_value = postRelease("wade16")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def wade16_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade16")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "wade16"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-    r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def wade16_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade16")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "wade16"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
-    summon = False  # summon check off; graph-side report only
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def wade16_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade16")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "wade16"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def wade16_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade16")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "wade16"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
-    get_dagster_logger().info(f"identifier stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def wade16_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "wade16"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
-    get_dagster_logger().info(f"bucket urls report returned {r} ")
- return
-
-
-# Can we simplify these into plain methods and then import them?
-# def missingreport_s3(context, msg: str, source="wade16"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="wade16"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_wade16():
- containers = wade16_getImage()
- harvest = wade16_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = wade16_missingreport_s3(start=harvest)
- report_idstat = wade16_identifier_stats(start=report_ms3)
-    # for some reason, this causes a missing msg parameter error
- report_bucketurl = wade16_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="wade16")
- load_release = wade16_naburelease(start=harvest)
- load_uploadrelease = wade16_uploadrelease(start=load_release)
-
- load_prune = wade16_nabu_prune(start=load_uploadrelease)
- load_prov = wade16_nabuprov(start=load_prune)
- load_org = wade16_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=wade16_missingreport_graph(start=load_org)
- report_graph=wade16_graph_reports(start=report_msgraph)
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_wade17.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_wade17.py
deleted file mode 100644
index 8ce081ef..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_wade17.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in Docker, not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
-    # revision of EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
-    ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
- # docker.py if replicated job, total completions = replicas
- # replicas =0 you do not get a container
-    service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
-        mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
-            raise Exception(f"Container for service {name} not starting")
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
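-    # Run a single Gleaner or Nabu container as a swarm replicated-job service.
-    # mode selects the command: "gleaner" (harvest) or the nabu subcommands
-    # "release", "prune", "prov", "orgs". Container logs are copied to S3 and a
-    # non-zero container exit code is re-raised as an op failure.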
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
-        cid = container.id  # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
-        ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
-        # Future: need to extract files and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def wade17_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def wade17_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "wade17")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def wade17_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "wade17")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def wade17_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "wade17")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def wade17_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "wade17")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def wade17_naburelease(context):
- returned_value = gleanerio(context,("release"), "wade17")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def wade17_uploadrelease(context):
- returned_value = postRelease("wade17")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def wade17_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade17")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "wade17"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-    r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def wade17_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade17")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "wade17"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
-    summon = False  # summon check off; graph-side report only
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def wade17_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade17")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "wade17"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def wade17_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade17")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "wade17"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
-    get_dagster_logger().info(f"identifier stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def wade17_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "wade17"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
-    get_dagster_logger().info(f"bucket urls report returned {r} ")
- return
-
-
-# Can we simplify these into plain methods and then import them?
-# def missingreport_s3(context, msg: str, source="wade17"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="wade17"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_wade17():
- containers = wade17_getImage()
- harvest = wade17_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = wade17_missingreport_s3(start=harvest)
- report_idstat = wade17_identifier_stats(start=report_ms3)
-    # for some reason, this causes a missing msg parameter error
- report_bucketurl = wade17_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="wade17")
- load_release = wade17_naburelease(start=harvest)
- load_uploadrelease = wade17_uploadrelease(start=load_release)
-
- load_prune = wade17_nabu_prune(start=load_uploadrelease)
- load_prov = wade17_nabuprov(start=load_prune)
- load_org = wade17_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=wade17_missingreport_graph(start=load_org)
- report_graph=wade17_graph_reports(start=report_msgraph)
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_wade18.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_wade18.py
deleted file mode 100644
index 0bdeeca0..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_wade18.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, and not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- with open(image_path, 'rb') as f:
-     data = f.read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
- # a future revision of EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
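-# A hedged sketch of the Blazegraph bulk-load call that postRelease() wraps,
-# kept commented out because it performs network I/O; the endpoint and object
-# URL below are illustrative only:
-#
-# endpoint = "http://graph:9999/blazegraph/namespace/iow/sparql"
-# release = "http://minio:9000/gleaner/graphs/latest/wade18_release.nq"
-# resp = requests.post(f"{endpoint}?uri={release}")
-# ok = resp.status_code == 200 and 'data modified="0"' not in resp.text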
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
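-# Note: _get_client() injects the Portainer API key by reaching into
-# docker-py's private _general_configs ("HttpHeaders") and the session
-# headers; this leans on docker-py internals and may break across versions.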
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
- ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
- # docker.py: for a replicated job, total completions = replicas
- # with replicas=0 you do not get a container
- service_mode = ServiceMode("replicated-job",concurrency=1,replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
- mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise Exception(f"Container for service {name} not starting")
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
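-# _create_service() uses Swarm's "replicated-job" service mode: exactly one
-# task runs to completion (RestartPolicy condition='none', replicas=1), and
-# the loop above polls for up to ~12 seconds for the task's container to
-# appear before giving up.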
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add env variables here, e.g. "Env": ["FOO=bar", "BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
- cid = container.id # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
- ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
- # Future: need to extract files, and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
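- # A hedged sketch of that future step, assuming the response bytes above
- # were first saved to a variable archive_bytes (hypothetical; kept
- # commented out, since member names are only known at runtime):
- #
- # import tarfile
- # with tarfile.open(fileobj=io.BytesIO(archive_bytes)) as tar:
- #     for member in tar.getmembers():
- #         if member.isfile():
- #             s3loader(tar.extractfile(member).read(), member.name)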
-
- # it looks like get_archive also has issues; it returns nothing.
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def wade18_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def wade18_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "wade18")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def wade18_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "wade18")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def wade18_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "wade18")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def wade18_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "wade18")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def wade18_naburelease(context):
- returned_value = gleanerio(context,("release"), "wade18")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def wade18_uploadrelease(context):
- returned_value = postRelease("wade18")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def wade18_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade18")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "wade18"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def wade18_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade18")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "wade18"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def wade18_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade18")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "wade18"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def wade18_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade18")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "wade18"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifier stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def wade18_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "wade18"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucket urls report returned {r} ")
- return
-
-
-# Can we simplify this and just use a method, then import these methods?
-# def missingreport_s3(context, msg: str, source="wade18"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="wade18"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_wade18():
- containers = wade18_getImage()
- harvest = wade18_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = wade18_missingreport_s3(start=harvest)
- report_idstat = wade18_identifier_stats(start=report_ms3)
- # for some reason, this caused a "msg parameter missing" error
- report_bucketurl = wade18_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="wade18")
- load_release = wade18_naburelease(start=harvest)
- load_uploadrelease = wade18_uploadrelease(start=load_release)
-
- load_prune = wade18_nabu_prune(start=load_uploadrelease)
- load_prov = wade18_nabuprov(start=load_prune)
- load_org = wade18_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=wade18_missingreport_graph(start=load_org)
- report_graph=wade18_graph_reports(start=report_msgraph)
-
-
-
-
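-# A hedged sketch of how a graph like this is typically wired into a job and
-# schedule in Dagster; the job name and cron string below are illustrative,
-# not the generated project's actual wiring:
-from dagster import ScheduleDefinition
-
-harvest_wade18_job = harvest_wade18.to_job(name="harvest_wade18_job")
-harvest_wade18_schedule = ScheduleDefinition(
-    job=harvest_wade18_job, cron_schedule="0 6 * * 0"
-)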
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_wade19.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_wade19.py
deleted file mode 100644
index 96d8f2e6..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_wade19.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, and not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- with open(image_path, 'rb') as f:
-     data = f.read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
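-# load_data() treats its argument as a URL first and falls back to reading a
-# local file when urlopen raises ValueError (i.e. the string has no scheme);
-# a hedged usage: load_data("/gleaner/gleanerconfig.yaml") reads from disk,
-# while load_data("https://example.org/cfg.yaml") fetches over HTTP.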
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
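-# Note: the MinIO client expects host[:port] without a scheme, so s3loader()
-# omits the port only when it matches the scheme default (80 for plain HTTP,
-# 443 for TLS); any other port is appended explicitly.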
-def postRelease(source):
- # a future revision of EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
- ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
- # docker.py: for a replicated job, total completions = replicas
- # with replicas=0 you do not get a container
- service_mode = ServiceMode("replicated-job",concurrency=1,replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
- mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise Exception(f"Container for service {name} not starting")
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add env variables here, e.g. "Env": ["FOO=bar", "BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
- cid = container.id # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
- ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
- # Future: need to extract files, and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # it looks like get_archive also has issues; it returns nothing.
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
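-# End-to-end shape of gleanerio(): resolve image/args per mode -> create a
-# one-shot Swarm service -> stream container logs to Dagster -> wait for the
-# exit code -> upload stdout/stderr and the /logs archive to MinIO ->
-# remove the service (unless DEBUG) -> raise on a non-zero exit.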
-
-@op
-def wade19_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def wade19_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "wade19")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def wade19_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "wade19")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def wade19_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "wade19")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def wade19_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "wade19")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def wade19_naburelease(context):
- returned_value = gleanerio(context,("release"), "wade19")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def wade19_uploadrelease(context):
- returned_value = postRelease("wade19")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def wade19_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade19")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "wade19"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def wade19_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade19")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "wade19"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def wade19_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade19")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "wade19"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def wade19_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade19")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "wade19"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifier stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def wade19_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "wade19"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucket urls report returned {r} ")
- return
-
-
-# Can we simplify this and just use a method, then import these methods?
-# def missingreport_s3(context, msg: str, source="wade19"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="wade19"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_wade19():
- containers = wade19_getImage()
- harvest = wade19_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = wade19_missingreport_s3(start=harvest)
- report_idstat = wade19_identifier_stats(start=report_ms3)
- # for some reason, this caused a "msg parameter missing" error
- report_bucketurl = wade19_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="wade19")
- load_release = wade19_naburelease(start=harvest)
- load_uploadrelease = wade19_uploadrelease(start=load_release)
-
- load_prune = wade19_nabu_prune(start=load_uploadrelease)
- load_prov = wade19_nabuprov(start=load_prune)
- load_org = wade19_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=wade19_missingreport_graph(start=load_org)
- report_graph=wade19_graph_reports(start=report_msgraph)
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_wade2.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_wade2.py
deleted file mode 100644
index d231a935..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_wade2.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
-    with open(image_path, 'rb') as f:
-        return f.read()
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
-    # a revision of EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
-    ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
-    # docker.py: for a replicated job, total completions = replicas
-    # with replicas=0 you do not get a container
-    service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
-        mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
-            raise Exception(f"Container for service {name} did not start")
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
-        cid = container.id  # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
-        ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
-        # Future: need to extract files and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def wade2_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def wade2_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "wade2")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def wade2_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "wade2")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def wade2_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "wade2")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def wade2_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "wade2")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def wade2_naburelease(context):
- returned_value = gleanerio(context,("release"), "wade2")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def wade2_uploadrelease(context):
- returned_value = postRelease("wade2")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def wade2_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade2")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "wade2"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-    r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def wade2_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade2")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "wade2"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def wade2_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade2")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "wade2"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def wade2_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade2")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "wade2"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
-    get_dagster_logger().info(f"identifier stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def wade2_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "wade2"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
-    get_dagster_logger().info(f"bucket urls report returned {r} ")
- return
-
-
-# Can we simplify and use just a method, then import these methods?
-# def missingreport_s3(context, msg: str, source="wade2"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="wade2"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_wade2():
- containers = wade2_getImage()
- harvest = wade2_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = wade2_missingreport_s3(start=harvest)
- report_idstat = wade2_identifier_stats(start=report_ms3)
-    # for some reason, this causes a "msg parameter missing" error
- report_bucketurl = wade2_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="wade2")
- load_release = wade2_naburelease(start=harvest)
- load_uploadrelease = wade2_uploadrelease(start=load_release)
-
- load_prune = wade2_nabu_prune(start=load_uploadrelease)
- load_prov = wade2_nabuprov(start=load_prune)
- load_org = wade2_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=wade2_missingreport_graph(start=load_org)
- report_graph=wade2_graph_reports(start=report_msgraph)
-
-
-
-
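
The ordering in harvest_wade2 above relies on Nothing dependencies (see the docs link in its comments): each downstream op declares a start input of type Nothing so it runs after its upstream op without receiving data from it. A self-contained toy of just that pattern, with all names invented:

```python
from dagster import In, Nothing, graph, op

@op
def step_a() -> Nothing:
    pass  # produces no value; only its completion matters

@op(ins={"start": In(Nothing)})
def step_b():
    pass  # ordered after step_a via the Nothing input; no value is passed

@graph
def ordered():
    step_b(start=step_a())

ordered_job = ordered.to_job()  # run with ordered_job.execute_in_process()
```
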
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_wade3.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_wade3.py
deleted file mode 100644
index 60f2348a..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_wade3.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils.util
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
-    with open(image_path, 'rb') as f:
-        return f.read()
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
-    # a revision of EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
-    ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
-    # docker.py: for a replicated job, total completions = replicas
-    # with replicas=0 you do not get a container
-    service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
-        mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
-            raise Exception(f"Container for service {name} did not start")
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
-        cid = container.id  # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
-        ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
-        # Future: need to extract files and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def wade3_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def wade3_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "wade3")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def wade3_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "wade3")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def wade3_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "wade3")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def wade3_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "wade3")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def wade3_naburelease(context):
- returned_value = gleanerio(context,("release"), "wade3")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def wade3_uploadrelease(context):
- returned_value = postRelease("wade3")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def wade3_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade3")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "wade3"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-    r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def wade3_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade3")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "wade3"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def wade3_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade3")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "wade3"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def wade3_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade3")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "wade3"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
-    get_dagster_logger().info(f"identifier stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def wade3_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "wade3"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
-    get_dagster_logger().info(f"bucket urls report returned {r} ")
- return
-
-
-# Can we simplify and use just a method, then import these methods?
-# def missingreport_s3(context, msg: str, source="wade3"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="wade3"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_wade3():
- containers = wade3_getImage()
- harvest = wade3_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = wade3_missingreport_s3(start=harvest)
- report_idstat = wade3_identifier_stats(start=report_ms3)
-    # for some reason, this causes a "msg parameter missing" error
- report_bucketurl = wade3_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="wade3")
- load_release = wade3_naburelease(start=harvest)
- load_uploadrelease = wade3_uploadrelease(start=load_release)
-
- load_prune = wade3_nabu_prune(start=load_uploadrelease)
- load_prov = wade3_nabuprov(start=load_prune)
- load_org = wade3_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=wade3_missingreport_graph(start=load_org)
- report_graph=wade3_graph_reports(start=report_msgraph)
-
-
-
-
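
_create_service above runs each Gleaner/Nabu invocation as a one-shot Swarm service: a "replicated-job" with one replica and no restart, after which it polls for the backing container and gives up after twelve tries. A trimmed sketch of that mechanic using the docker SDK; the daemon URL, image, and service name are placeholders:

```python
import time

import docker
from docker.types import RestartPolicy, ServiceMode

client = docker.DockerClient(base_url="tcp://swarm-manager:2375")  # placeholder endpoint

service = client.services.create(
    "alpine:3",                                       # placeholder image
    args=["echo", "hello"],
    name="sch_example",
    restart_policy=RestartPolicy(condition="none"),   # one-shot: never restart
    mode=ServiceMode("replicated-job", concurrency=1, replicas=1),
)
try:
    for _ in range(12):  # bounded wait, as in _create_service
        time.sleep(1)
        containers = client.containers.list(
            all=True,
            filters={"label": "com.docker.swarm.service.name=sch_example"},
        )
        if containers:
            print(containers[0].logs().decode())
            break
    else:
        raise RuntimeError("container for service sch_example did not start")
finally:
    service.remove()
```
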
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_wade4.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_wade4.py
deleted file mode 100644
index 15dd87e8..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_wade4.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils.util
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
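-# Example (illustrative): _pythonMinioUrl("s3.us-west-2.amazonaws.com") -> "s3.amazonaws.com";
-# any non-AWS host, e.g. "minio.example.org", passes through unchanged.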
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
- # a revision of the EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
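-
-# Illustrative: for source "wade4" the URLs built above look like
-#   release_url: http://<minio-address>:<port>/<bucket>/graphs/latest/wade4_release.nq
-#   post target: <graph-url>/namespace/<namespace>/sparql?uri=<release_url>
-# (placeholders come from the GLEANERIO_* environment variables)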
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
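-
-# Note: the X-API-Key header injection above targets a Portainer-proxied Docker API;
-# client.api._general_configs is a docker-py internal, so this is an assumption that
-# may need revisiting across docker-py versions.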
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
- ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
- # docker.py if replicated job, total completions = replicas
- # replicas =0 you do not get a container
- service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
- mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise f"Container for service {name} not starting"
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
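-
-# Note: ServiceMode("replicated-job", concurrency=1, replicas=1) runs the service as a
-# one-shot swarm job; swarm schedules its container asynchronously, so the loop above
-# polls for up to ~12 seconds before giving up.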
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
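-
- # For reference (illustrative): with the image entrypoint prepended, the assembled
- # container commands look like:
- #   gleaner --cfg /gleaner/gleanerconfig.yaml -source wade4 --rude
- #   nabu --cfg /nabu/nabuconfig.yaml release --prefix summoned/wade4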
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
- cid = container.id # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
- ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
- # Future: need to extract files and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues; it returns nothing.
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def wade4_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def wade4_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "wade4")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def wade4_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "wade4")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def wade4_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "wade4")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def wade4_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "wade4")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def wade4_naburelease(context):
- returned_value = gleanerio(context,("release"), "wade4")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def wade4_uploadrelease(context):
- returned_value = postRelease("wade4")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def wade4_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade4")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "wade4"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def wade4_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade4")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "wade4"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon off; report against the graph only
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def wade4_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade4")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "wade4"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def wade4_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade4")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "wade4"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def wade4_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "wade4"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
-#Can we simplify and use just a method. Then import these methods?
-# def missingreport_s3(context, msg: str, source="wade4"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="wade4"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_wade4():
- containers = wade4_getImage()
- harvest = wade4_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = wade4_missingreport_s3(start=harvest)
- report_idstat = wade4_identifier_stats(start=report_ms3)
- # for some reason, this causes a missing "msg" parameter error
- report_bucketurl = wade4_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="wade4")
- load_release = wade4_naburelease(start=harvest)
- load_uploadrelease = wade4_uploadrelease(start=load_release)
-
- load_prune = wade4_nabu_prune(start=load_uploadrelease)
- load_prov = wade4_nabuprov(start=load_prune)
- load_org = wade4_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=wade4_missingreport_graph(start=load_org)
- report_graph=wade4_graph_reports(start=report_msgraph)
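-
-# Illustrative (assumption: job wiring lives elsewhere in the generated code): since
-# harvest_wade4 is a @graph, a runnable job could be built from it with
-# harvest_wade4.to_job(name="harvest_wade4_job").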
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_wade5.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_wade5.py
deleted file mode 100644
index 4c7e0c82..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_wade5.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
- # a revision of the EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
- ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
- # docker.py if replicated job, total completions = replicas
- # replicas =0 you do not get a container
- service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
- mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise f"Container for service {name} not starting"
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
- cid = container.id # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
- ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
- # Future: need to extract files and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues; it returns nothing.
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
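-
-# Note: with DEBUG=true the finally block above intentionally leaves the swarm service in
-# place so its container and logs can be inspected; it must then be removed manually.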
-
-@op
-def wade5_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def wade5_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "wade5")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def wade5_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "wade5")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def wade5_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "wade5")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def wade5_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "wade5")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def wade5_naburelease(context):
- returned_value = gleanerio(context,("release"), "wade5")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def wade5_uploadrelease(context):
- returned_value = postRelease("wade5")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def wade5_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade5")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "wade5"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def wade5_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade5")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "wade5"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon off; report against the graph only
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def wade5_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade5")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "wade5"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def wade5_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade5")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "wade5"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def wade5_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "wade5"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
-#Can we simplify and use just a method. Then import these methods?
-# def missingreport_s3(context, msg: str, source="wade5"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="wade5"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_wade5():
- containers = wade5_getImage()
- harvest = wade5_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = wade5_missingreport_s3(start=harvest)
- report_idstat = wade5_identifier_stats(start=report_ms3)
- # for some reason, this causes a missing "msg" parameter error
- report_bucketurl = wade5_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="wade5")
- load_release = wade5_naburelease(start=harvest)
- load_uploadrelease = wade5_uploadrelease(start=load_release)
-
- load_prune = wade5_nabu_prune(start=load_uploadrelease)
- load_prov = wade5_nabuprov(start=load_prune)
- load_org = wade5_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=wade5_missingreport_graph(start=load_org)
- report_graph=wade5_graph_reports(start=report_msgraph)
-
-
-
-
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_wade6.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_wade6.py
deleted file mode 100644
index ca3e0953..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_wade6.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network names must be the Docker names, not the docker-compose object names
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
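`GLEANER_MINIO_USE_SSL` above leans on `distutils.util.strtobool`, and distutils is deprecated and removed in Python 3.12. A hedged drop-in sketch (the helper is ours, it returns a bool rather than strtobool's 0/1, and the `"false"` default is our assumption — the deleted code required the variable to be set):

```python
import os

def strtobool(value: str) -> bool:
    # same truthy/falsy vocabulary as distutils.util.strtobool
    v = value.strip().lower()
    if v in ("y", "yes", "t", "true", "on", "1"):
        return True
    if v in ("n", "no", "f", "false", "off", "0"):
        return False
    raise ValueError(f"invalid truth value {value!r}")

GLEANER_MINIO_USE_SSL = strtobool(os.environ.get("GLEANERIO_MINIO_USE_SSL", "false"))
```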
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- # context manager closes the file handle; the old one-liner leaked it
- with open(image_path, 'rb') as f:
- return f.read()
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
- # revision of EC utilities, will have a insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
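`postRelease` drives Blazegraph's insert-from-URL path: POSTing to the namespace's SPARQL endpoint with a `uri` query parameter asks Blazegraph to fetch and load that file itself, and a 200 whose body contains `data modified="0"` means nothing was actually added. A sketch of that call in isolation (endpoint and release URL are placeholders):

```python
import requests

def insert_release(sparql_endpoint: str, release_url: str) -> None:
    r = requests.post(sparql_endpoint, params={"uri": release_url})
    r.raise_for_status()
    # Blazegraph reports the mutation count in the response body,
    # so a 200 alone does not prove any triples landed
    if 'data modified="0"' in r.text:
        raise RuntimeError("no data added: " + r.text)

insert_release(
    "http://graph:9999/blazegraph/namespace/iow/sparql",
    "https://minio.example.org/gleaner/graphs/latest/wade6_release.nq",
)
```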
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
- ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
- # docker.py if replicated job, total completions = replicas
- # replicas =0 you do not get a container
- service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
- mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise f"Container for service {name} not starting"
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
-
-
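`_create_service` has to poll because a replicated-job service creates its task container asynchronously. A sketch of the same wait with an explicit deadline and a real exception (`client` is a docker.DockerClient; the 12-second budget mirrors the original's 12 one-second polls):

```python
import time

def wait_for_service_container(client, name: str, timeout_s: float = 12.0):
    deadline = time.monotonic() + timeout_s
    while time.monotonic() < deadline:
        # swarm labels the task container with its owning service name
        containers = client.containers.list(
            all=True,
            filters={"label": f"com.docker.swarm.service.name={name}"},
        )
        if containers:
            return containers[0]
        time.sleep(1)
    raise RuntimeError(f"Container for service {name} not starting")
```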
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
- cid = container.id # legacy until the service start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
- ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
- # Future: need to extract the files and upload them
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
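The mode handling at the top of `gleanerio` repeats the image/name/workdir assignments five times; only the command differs per mode. A hedged, table-driven equivalent (the config-path constants mirror the defaults above; untested sketch, not the shipped code):

```python
GLEANER_CFG = "/gleaner/gleanerconfig.yaml"
NABU_CFG = "/nabu/nabuconfig.yaml"

def build_command(mode: str, source: str) -> list:
    # one row per mode; everything else about the container is shared
    commands = {
        "gleaner": ["--cfg", GLEANER_CFG, "-source", source, "--rude"],
        "prune": ["--cfg", NABU_CFG, "prune", "--prefix", "summoned/" + source],
        "prov": ["--cfg", NABU_CFG, "prefix", "--prefix", "prov/" + source],
        "orgs": ["--cfg", NABU_CFG, "prefix", "--prefix", "orgs"],
        "release": ["--cfg", NABU_CFG, "release", "--prefix", "summoned/" + source],
    }
    if mode not in commands:
        raise ValueError(f"unknown gleanerio mode: {mode}")
    return commands[mode]
```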
-@op
-def wade6_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def wade6_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "wade6")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def wade6_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "wade6")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def wade6_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "wade6")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def wade6_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "wade6")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def wade6_naburelease(context):
- returned_value = gleanerio(context,("release"), "wade6")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def wade6_uploadrelease(context):
- returned_value = postRelease("wade6")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def wade6_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade6")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "wade6"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def wade6_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade6")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "wade6"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def wade6_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade6")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "wade6"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def wade6_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade6")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "wade6"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def wade6_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "wade6"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
-#Can we simplify and use just a method. Then import these methods?
-# def missingreport_s3(context, msg: str, source="wade6"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="wade6"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_wade6():
- containers = wade6_getImage()
- harvest = wade6_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = wade6_missingreport_s3(start=harvest)
- report_idstat = wade6_identifier_stats(start=report_ms3)
- # for some reason, this caused a "missing msg parameter" error
- report_bucketurl = wade6_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="wade6")
- load_release = wade6_naburelease(start=harvest)
- load_uploadrelease = wade6_uploadrelease(start=load_release)
-
- load_prune = wade6_nabu_prune(start=load_uploadrelease)
- load_prov = wade6_nabuprov(start=load_prune)
- load_org = wade6_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=wade6_missingreport_graph(start=load_org)
- report_graph=wade6_graph_reports(start=report_msgraph)
-
-
-
-
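The deleted code downloads `{WorkingDir}/logs` through the Engine's `/containers/{id}/archive` endpoint but leaves extraction as a TODO; the response is a tar stream, so the commented StringIO/utf-8 attempt would break on binary members. A sketch of unpacking it with tarfile (the helper name is ours):

```python
import io
import tarfile

def iter_archive_members(archive_bytes: bytes):
    # /containers/{id}/archive returns a tar stream of the requested path
    with tarfile.open(fileobj=io.BytesIO(archive_bytes)) as tar:
        for member in tar.getmembers():
            if member.isfile():
                fh = tar.extractfile(member)
                if fh is not None:
                    yield member.name, fh.read()

# each (name, data) pair could then be handed to s3loader(data, name)
```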
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_wade7.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_wade7.py
deleted file mode 100644
index 2b7294df..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_wade7.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils.util
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network names must be the Docker names, not the docker-compose object names
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- # context manager closes the file handle; the old one-liner leaked it
- with open(image_path, 'rb') as f:
- return f.read()
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
- # revision of EC utilities, will have a insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
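`_get_client` above works around docker-py's lack of a public per-request header hook: it plants the Portainer `X-API-Key` in the client's private `_general_configs` and header dict. A sketch of that trick in isolation; note it leans on docker-py internals and may break between versions:

```python
import docker

def portainer_client(url: str, api_key: str) -> docker.DockerClient:
    client = docker.DockerClient(base_url=url, version="1.43")
    headers = {"X-API-Key": api_key}
    # HttpHeaders in the general config dict are merged into every API request
    if client.api._general_configs:
        client.api._general_configs["HttpHeaders"] = headers
    else:
        client.api._general_configs = {"HttpHeaders": headers}
    client.api.headers["X-API-Key"] = api_key
    return client
```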
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
- ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
- # docker.py if replicated job, total completions = replicas
- # replicas =0 you do not get a container
- service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
- mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise f"Container for service {name} not starting"
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
- cid = container.id # legacy until the service start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
- ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
- # Future: need to extract the files and upload them
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
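The `finally` block above encodes one contract: the swarm service is always removed unless DEBUG is set, in which case it is kept for inspection. The minimal shape of that contract, with the create and run steps as stand-ins:

```python
import os

DEBUG = os.getenv("DEBUG", "False").lower() == "true"

def run_with_cleanup(create_service, run):
    service = None
    try:
        service = create_service()
        return run(service)
    finally:
        if service is None:
            pass  # nothing was created, nothing to remove
        elif DEBUG:
            print(f"service {service.name} kept for inspection: DEBUG enabled")
        else:
            service.remove()
```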
-@op
-def wade7_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def wade7_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "wade7")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def wade7_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "wade7")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def wade7_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "wade7")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def wade7_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "wade7")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def wade7_naburelease(context):
- returned_value = gleanerio(context,("release"), "wade7")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def wade7_uploadrelease(context):
- returned_value = postRelease("wade7")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def wade7_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade7")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "wade7"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def wade7_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade7")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "wade7"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def wade7_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade7")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "wade7"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def wade7_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade7")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "wade7"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def wade7_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "wade7"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
-#Can we simplify and use just a method. Then import these methods?
-# def missingreport_s3(context, msg: str, source="wade7"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="wade7"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_wade7():
- containers = wade7_getImage()
- harvest = wade7_gleaner(start=containers)
-
-# defining Nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = wade7_missingreport_s3(start=harvest)
- report_idstat = wade7_identifier_stats(start=report_ms3)
- # for some reason, this caused a "missing msg parameter" error
- report_bucketurl = wade7_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="wade7")
- load_release = wade7_naburelease(start=harvest)
- load_uploadrelease = wade7_uploadrelease(start=load_release)
-
- load_prune = wade7_nabu_prune(start=load_uploadrelease)
- load_prov = wade7_nabuprov(start=load_prune)
- load_org = wade7_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=wade7_missingreport_graph(start=load_org)
- report_graph=wade7_graph_reports(start=report_msgraph)
-
-
-
-
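Every deleted implnet_ops_wade*.py module is identical except for the source name baked into the op names and arguments. A hedged sketch of a factory that stamps out the same op/graph set per source at runtime, in the spirit of replacing the generated per-source modules with one module (names and the two-op pipeline are illustrative, not the full harvest chain):

```python
from dagster import In, Nothing, get_dagster_logger, graph, op

def make_harvest_graph(source: str):
    @op(name=f"{source}_gleaner")
    def gleaner_op():
        get_dagster_logger().info(f"harvest {source}")

    @op(name=f"{source}_naburelease", ins={"start": In(Nothing)})
    def release_op():
        get_dagster_logger().info(f"release {source}")

    @graph(name=f"harvest_{source}")
    def harvest():
        # the start= wiring gives the same Nothing-dependency ordering
        release_op(start=gleaner_op())

    return harvest

jobs = [make_harvest_graph(s).to_job() for s in ("wade5", "wade6", "wade7", "wade8")]
```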
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_wade8.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_wade8.py
deleted file mode 100644
index 3acff804..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_wade8.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils.util
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
- # a revision of EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
- ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
- # docker.py: for a replicated job, total completions = replicas
- # with replicas=0 you do not get a container
- service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
- mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise f"Container for service {name} not starting"
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add env variables here, e.g. "Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
- cid = container.id # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
- ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
- # Future: need to extract files and upload them
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def wade8_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def wade8_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "wade8")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def wade8_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "wade8")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def wade8_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "wade8")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def wade8_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "wade8")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def wade8_naburelease(context):
- returned_value = gleanerio(context,("release"), "wade8")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def wade8_uploadrelease(context):
- returned_value = postRelease("wade8")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def wade8_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade8")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "wade8"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def wade8_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade8")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "wade8"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def wade8_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade8")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "wade8"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def wade8_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade8")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "wade8"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def wade8_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "wade8"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
-# Can we simplify and use just a method, then import these methods?
-# def missingreport_s3(context, msg: str, source="wade8"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="wade8"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_wade8():
- containers = wade8_getImage()
- harvest = wade8_gleaner(start=containers)
-
-# defining nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = wade8_missingreport_s3(start=harvest)
- report_idstat = wade8_identifier_stats(start=report_ms3)
- # for some reason, this causes a "msg parameter missing" error
- report_bucketurl = wade8_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="wade8")
- load_release = wade8_naburelease(start=harvest)
- load_uploadrelease = wade8_uploadrelease(start=load_release)
-
- load_prune = wade8_nabu_prune(start=load_uploadrelease)
- load_prov = wade8_nabuprov(start=load_prune)
- load_org = wade8_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=wade8_missingreport_graph(start=load_org)
- report_graph=wade8_graph_reports(start=report_msgraph)
-
-
-
-
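
The harvest_* graphs above chain ops that pass no data; ordering comes entirely from In(Nothing) inputs, per the Dagster docs link in each graph body. A minimal self-contained sketch of that pattern:

from dagster import In, Nothing, graph, op

@op
def first_step():
    pass  # produces no meaningful output

@op(ins={"start": In(Nothing)})
def second_step():
    pass  # the In(Nothing) input carries no data, only execution order

@graph
def ordered():
    # wiring first_step's output into the Nothing input sequences the ops
    second_step(start=first_step())
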
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_wade9.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_wade9.py
deleted file mode 100644
index dbf4c12e..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_wade9.py
+++ /dev/null
@@ -1,728 +0,0 @@
-import distutils.util  # strtobool lives in distutils.util
-import time
-
-from dagster import job, op, graph,In, Nothing, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-
-from docker.types import RestartPolicy, ServiceMode
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and network need to be the names in docker, not the names of the objects in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
-GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
-GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
-GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
-GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
-GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
-GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
-GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
-GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
-GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
-GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
-def _graphEndpoint():
- url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT
- get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
- get_dagster_logger().debug(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = GLEANER_MINIO_USE_SSL,
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= GLEANER_MINIO_USE_SSL
- if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80"
- and secure == False):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443"
- and secure == True):
- server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS)
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=GLEANER_MINIO_ACCESS_KEY,
- secret_key=GLEANER_MINIO_SECRET_KEY,
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = GLEANERIO_LOG_PREFIX + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
- # a revision of EC utilities will have an insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if GLEANER_MINIO_USE_SSL:
- proto = "https"
- port = GLEANER_MINIO_PORT
- address = GLEANER_MINIO_ADDRESS
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"create docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _create_service(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name="",
- workingdir="/",
-
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"create docker service for {name}")
- ## thoughts
- # return service, container, since there is one
- restart_policy = RestartPolicy(condition='none')
- # docker.py: for a replicated job, total completions = replicas
- # with replicas=0 you do not get a container
- service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
- get_dagster_logger().info(str(client.configs.list()))
- # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
- get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
- get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
- configs = [gleaner,nabu]
- # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- service = client.services.create(
- image,
- args=command,
- env= env_vars,
- name=name ,
- networks= container_context.networks if len(container_context.networks) else None,
- restart_policy = restart_policy,
- mode=service_mode,
- workdir=workingdir,
- configs=configs
- )
- wait_count =0
- while True:
- time.sleep(1)
- wait_count+=1
- get_dagster_logger().debug(str(service.tasks()))
-
- container_task = service.tasks(filters={"service":name})
-
- containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"})
- if len(containers)> 0:
- break
- if wait_count > 12:
- raise f"Container for service {name} not starting"
-
- get_dagster_logger().info(len(containers))
- return service, containers[0]
-
-
-
-
-def gleanerio(context, mode, source):
- ## ------------ Create
- returnCode = 0
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE =GLEANERIO_GLEANER_IMAGE
-
- # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "prune"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = GLEANERIO_NABU_IMAGE
-
- ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"sch_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
-
- returnCode = 1
- return returnCode
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = ARGS
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add env variables here, e.g. "Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
-
-
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- # "volumes": {
- # f"{GLEANER_CONFIG_VOLUME}":
- # {'bind': '/configs', 'mode': 'rw'}
- # },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_service: ")
- service, container = _create_service(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
- workingdir=data["WorkingDir"]
- )
- except Exception as err:
- raise err
-
-
- cid = container.id # legacy until the start gets fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-
- # do not let a possible issue with container logs stop log upload.
- ## I think this happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
- get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ")
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + '/containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
- # Future: need to extract files and upload them
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
-
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- if (service):
- service.remove()
- get_dagster_logger().info(f"Service Remove: {service.name}")
- else:
- get_dagster_logger().info(f"Service Not created, so not removed.")
-
- else:
- get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
-
- if (returnCode != 0):
- get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
- raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
- return returnCode
-
-@op
-def wade9_getImage(context):
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(run_container_context)
- client.images.pull(GLEANERIO_GLEANER_IMAGE)
- client.images.pull(GLEANERIO_NABU_IMAGE)
-@op(ins={"start": In(Nothing)})
-def wade9_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "wade9")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def wade9_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "wade9")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prune returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def wade9_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "wade9")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu prov returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def wade9_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "wade9")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu org load returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def wade9_naburelease(context):
- returned_value = gleanerio(context,("release"), "wade9")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"nabu release returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def wade9_uploadrelease(context):
- returned_value = postRelease("wade9")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"upload release returned {r} ")
- return
-
-
-@op(ins={"start": In(Nothing)})
-def wade9_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade9")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "wade9"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- get_dagster_logger().info(f"missing s3 report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def wade9_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade9")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "wade9"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
- get_dagster_logger().info(f"missing graph report returned {r} ")
- return
-@op(ins={"start": In(Nothing)})
-def wade9_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade9")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "wade9"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def wade9_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade9")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "wade9"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- get_dagster_logger().info(f"identifer stats report returned {r} ")
- return
-
-@op(ins={"start": In(Nothing)})
-def wade9_bucket_urls(context):
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "wade9"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- get_dagster_logger().info(f"bucker urls report returned {r} ")
- return
-
-
-# Can we simplify and use just a method, then import these methods?
-# def missingreport_s3(context, msg: str, source="wade9"):
-#
-# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="wade9"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_wade9():
- containers = wade9_getImage()
- harvest = wade9_gleaner(start=containers)
-
-# defining nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = wade9_missingreport_s3(start=harvest)
- report_idstat = wade9_identifier_stats(start=report_ms3)
- # for some reason, this causes a "msg parameter missing" error
- report_bucketurl = wade9_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="wade9")
- load_release = wade9_naburelease(start=harvest)
- load_uploadrelease = wade9_uploadrelease(start=load_release)
-
- load_prune = wade9_nabu_prune(start=load_uploadrelease)
- load_prov = wade9_nabuprov(start=load_prune)
- load_org = wade9_nabuorg(start=load_prov)
-
-# run after load
- report_msgraph=wade9_missingreport_graph(start=load_org)
- report_graph=wade9_graph_reports(start=report_msgraph)
-
-
-
-
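
The repository module deleted next imports every generated job/schedule pair by hand (nwisgw0 through nwisgw28, nwissite0 through nwissite3, and others). A hedged sketch of assembling the same definitions in a loop; the module and attribute naming is assumed from the imports visible in the deleted file:

# sketch only: derive the job/schedule list instead of the long hand-written import list
import importlib

from dagster import repository

SOURCES = [f"nwisgw{i}" for i in range(29)] + [f"nwissite{i}" for i in range(4)]

@repository
def implnet_repo():
    defs = []
    for src in SOURCES:
        jobs_mod = importlib.import_module(f"jobs.implnet_jobs_{src}")
        sch_mod = importlib.import_module(f"sch.implnet_sch_{src}")
        defs.append(getattr(jobs_mod, f"implnet_job_{src}"))
        defs.append(getattr(sch_mod, f"implnet_sch_{src}"))
    return defs
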
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/repositories/repository.py b/dagster/implnets/generatedCode/implnet-iow/output/repositories/repository.py
deleted file mode 100644
index f7f74595..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/repositories/repository.py
+++ /dev/null
@@ -1,371 +0,0 @@
-from dagster import repository
-from jobs.implnet_jobs_nwisgw20 import implnet_job_nwisgw20
-from sch.implnet_sch_nwisgw20 import implnet_sch_nwisgw20
-from jobs.implnet_jobs_nwisgw22 import implnet_job_nwisgw22
-from sch.implnet_sch_nwisgw22 import implnet_sch_nwisgw22
-from jobs.implnet_jobs_nwisgw16 import implnet_job_nwisgw16
-from sch.implnet_sch_nwisgw16 import implnet_sch_nwisgw16
-from jobs.implnet_jobs_nwisgw12 import implnet_job_nwisgw12
-from sch.implnet_sch_nwisgw12 import implnet_sch_nwisgw12
-from jobs.implnet_jobs_nwisgw25 import implnet_job_nwisgw25
-from sch.implnet_sch_nwisgw25 import implnet_sch_nwisgw25
-from jobs.implnet_jobs_nwisgw14 import implnet_job_nwisgw14
-from sch.implnet_sch_nwisgw14 import implnet_sch_nwisgw14
-from jobs.implnet_jobs_nwisgw23 import implnet_job_nwisgw23
-from sch.implnet_sch_nwisgw23 import implnet_sch_nwisgw23
-from jobs.implnet_jobs_nwisgw10 import implnet_job_nwisgw10
-from sch.implnet_sch_nwisgw10 import implnet_sch_nwisgw10
-from jobs.implnet_jobs_nwisgw15 import implnet_job_nwisgw15
-from sch.implnet_sch_nwisgw15 import implnet_sch_nwisgw15
-from jobs.implnet_jobs_nwisgw2 import implnet_job_nwisgw2
-from sch.implnet_sch_nwisgw2 import implnet_sch_nwisgw2
-from jobs.implnet_jobs_nwisgw24 import implnet_job_nwisgw24
-from sch.implnet_sch_nwisgw24 import implnet_sch_nwisgw24
-from jobs.implnet_jobs_nwisgw9 import implnet_job_nwisgw9
-from sch.implnet_sch_nwisgw9 import implnet_sch_nwisgw9
-from jobs.implnet_jobs_nwisgw19 import implnet_job_nwisgw19
-from sch.implnet_sch_nwisgw19 import implnet_sch_nwisgw19
-from jobs.implnet_jobs_nwisgw28 import implnet_job_nwisgw28
-from sch.implnet_sch_nwisgw28 import implnet_sch_nwisgw28
-from jobs.implnet_jobs_nwisgw26 import implnet_job_nwisgw26
-from sch.implnet_sch_nwisgw26 import implnet_sch_nwisgw26
-from jobs.implnet_jobs_nwisgw5 import implnet_job_nwisgw5
-from sch.implnet_sch_nwisgw5 import implnet_sch_nwisgw5
-from jobs.implnet_jobs_nwisgw13 import implnet_job_nwisgw13
-from sch.implnet_sch_nwisgw13 import implnet_sch_nwisgw13
-from jobs.implnet_jobs_nwisgw6 import implnet_job_nwisgw6
-from sch.implnet_sch_nwisgw6 import implnet_sch_nwisgw6
-from jobs.implnet_jobs_nwisgw3 import implnet_job_nwisgw3
-from sch.implnet_sch_nwisgw3 import implnet_sch_nwisgw3
-from jobs.implnet_jobs_nwisgw4 import implnet_job_nwisgw4
-from sch.implnet_sch_nwisgw4 import implnet_sch_nwisgw4
-from jobs.implnet_jobs_nwisgw1 import implnet_job_nwisgw1
-from sch.implnet_sch_nwisgw1 import implnet_sch_nwisgw1
-from jobs.implnet_jobs_nwisgw21 import implnet_job_nwisgw21
-from sch.implnet_sch_nwisgw21 import implnet_sch_nwisgw21
-from jobs.implnet_jobs_nwisgw27 import implnet_job_nwisgw27
-from sch.implnet_sch_nwisgw27 import implnet_sch_nwisgw27
-from jobs.implnet_jobs_nwisgw8 import implnet_job_nwisgw8
-from sch.implnet_sch_nwisgw8 import implnet_sch_nwisgw8
-from jobs.implnet_jobs_nwisgw17 import implnet_job_nwisgw17
-from sch.implnet_sch_nwisgw17 import implnet_sch_nwisgw17
-from jobs.implnet_jobs_nwisgw18 import implnet_job_nwisgw18
-from sch.implnet_sch_nwisgw18 import implnet_sch_nwisgw18
-from jobs.implnet_jobs_nwisgw7 import implnet_job_nwisgw7
-from sch.implnet_sch_nwisgw7 import implnet_sch_nwisgw7
-from jobs.implnet_jobs_nwisgw11 import implnet_job_nwisgw11
-from sch.implnet_sch_nwisgw11 import implnet_sch_nwisgw11
-from jobs.implnet_jobs_nwisgw0 import implnet_job_nwisgw0
-from sch.implnet_sch_nwisgw0 import implnet_sch_nwisgw0
-from jobs.implnet_jobs_nwissite1 import implnet_job_nwissite1
-from sch.implnet_sch_nwissite1 import implnet_sch_nwissite1
-from jobs.implnet_jobs_nwissite3 import implnet_job_nwissite3
-from sch.implnet_sch_nwissite3 import implnet_sch_nwissite3
-from jobs.implnet_jobs_nwissite0 import implnet_job_nwissite0
-from sch.implnet_sch_nwissite0 import implnet_sch_nwissite0
-from jobs.implnet_jobs_nwissite2 import implnet_job_nwissite2
-from sch.implnet_sch_nwissite2 import implnet_sch_nwissite2
-from jobs.implnet_jobs_gfv11pois1 import implnet_job_gfv11pois1
-from sch.implnet_sch_gfv11pois1 import implnet_sch_gfv11pois1
-from jobs.implnet_jobs_gfv11pois0 import implnet_job_gfv11pois0
-from sch.implnet_sch_gfv11pois0 import implnet_sch_gfv11pois0
-from jobs.implnet_jobs_hydrologicunit0 import implnet_job_hydrologicunit0
-from sch.implnet_sch_hydrologicunit0 import implnet_sch_hydrologicunit0
-from jobs.implnet_jobs_damspids0 import implnet_job_damspids0
-from sch.implnet_sch_damspids0 import implnet_sch_damspids0
-from jobs.implnet_jobs_cuahsihishydrodataczhrids0 import implnet_job_cuahsihishydrodataczhrids0
-from sch.implnet_sch_cuahsihishydrodataczhrids0 import implnet_sch_cuahsihishydrodataczhrids0
-from jobs.implnet_jobs_cuahsihisnooksackmicroclimatenetworkids0 import implnet_job_cuahsihisnooksackmicroclimatenetworkids0
-from sch.implnet_sch_cuahsihisnooksackmicroclimatenetworkids0 import implnet_sch_cuahsihisnooksackmicroclimatenetworkids0
-from jobs.implnet_jobs_cuahsihisneonids0 import implnet_job_cuahsihisneonids0
-from sch.implnet_sch_cuahsihisneonids0 import implnet_sch_cuahsihisneonids0
-from jobs.implnet_jobs_cuahsihisglobalriversobservatoryids0 import implnet_job_cuahsihisglobalriversobservatoryids0
-from sch.implnet_sch_cuahsihisglobalriversobservatoryids0 import implnet_sch_cuahsihisglobalriversobservatoryids0
-from jobs.implnet_jobs_cuahsihistncwaterdataids0 import implnet_job_cuahsihistncwaterdataids0
-from sch.implnet_sch_cuahsihistncwaterdataids0 import implnet_sch_cuahsihistncwaterdataids0
-from jobs.implnet_jobs_cuahsihisscotlandnwisids0 import implnet_job_cuahsihisscotlandnwisids0
-from sch.implnet_sch_cuahsihisscotlandnwisids0 import implnet_sch_cuahsihisscotlandnwisids0
-from jobs.implnet_jobs_cuahsihisczoboulderids0 import implnet_job_cuahsihisczoboulderids0
-from sch.implnet_sch_cuahsihisczoboulderids0 import implnet_sch_cuahsihisczoboulderids0
-from jobs.implnet_jobs_cuahsihisyosemitehydroclimatenetworkids0 import implnet_job_cuahsihisyosemitehydroclimatenetworkids0
-from sch.implnet_sch_cuahsihisyosemitehydroclimatenetworkids0 import implnet_sch_cuahsihisyosemitehydroclimatenetworkids0
-from jobs.implnet_jobs_cuahsihismuddyriverids0 import implnet_job_cuahsihismuddyriverids0
-from sch.implnet_sch_cuahsihismuddyriverids0 import implnet_sch_cuahsihismuddyriverids0
-from jobs.implnet_jobs_cuahsihisczomercedids0 import implnet_job_cuahsihisczomercedids0
-from sch.implnet_sch_cuahsihisczomercedids0 import implnet_sch_cuahsihisczomercedids0
-from jobs.implnet_jobs_cuahsihisghcnids0 import implnet_job_cuahsihisghcnids0
-from sch.implnet_sch_cuahsihisghcnids0 import implnet_sch_cuahsihisghcnids0
-from jobs.implnet_jobs_cuahsihismmaatacamaids0 import implnet_job_cuahsihismmaatacamaids0
-from sch.implnet_sch_cuahsihismmaatacamaids0 import implnet_sch_cuahsihismmaatacamaids0
-from jobs.implnet_jobs_cuahsihisumbcwqids0 import implnet_job_cuahsihisumbcwqids0
-from sch.implnet_sch_cuahsihisumbcwqids0 import implnet_sch_cuahsihisumbcwqids0
-from jobs.implnet_jobs_cuahsihisgleonlakeannieids0 import implnet_job_cuahsihisgleonlakeannieids0
-from sch.implnet_sch_cuahsihisgleonlakeannieids0 import implnet_sch_cuahsihisgleonlakeannieids0
-from jobs.implnet_jobs_cuahsihisluwlids0 import implnet_job_cuahsihisluwlids0
-from sch.implnet_sch_cuahsihisluwlids0 import implnet_sch_cuahsihisluwlids0
-from jobs.implnet_jobs_cuahsihiscedarriverids0 import implnet_job_cuahsihiscedarriverids0
-from sch.implnet_sch_cuahsihiscedarriverids0 import implnet_sch_cuahsihiscedarriverids0
-from jobs.implnet_jobs_cuahsihisccbepdapids0 import implnet_job_cuahsihisccbepdapids0
-from sch.implnet_sch_cuahsihisccbepdapids0 import implnet_sch_cuahsihisccbepdapids0
-from jobs.implnet_jobs_cuahsihiskansasweatherdataids0 import implnet_job_cuahsihiskansasweatherdataids0
-from sch.implnet_sch_cuahsihiskansasweatherdataids0 import implnet_sch_cuahsihiskansasweatherdataids0
-from jobs.implnet_jobs_cuahsihisodmkentstateids0 import implnet_job_cuahsihisodmkentstateids0
-from sch.implnet_sch_cuahsihisodmkentstateids0 import implnet_sch_cuahsihisodmkentstateids0
-from jobs.implnet_jobs_cuahsihisgleondorsetids0 import implnet_job_cuahsihisgleondorsetids0
-from sch.implnet_sch_cuahsihisgleondorsetids0 import implnet_sch_cuahsihisgleondorsetids0
-from jobs.implnet_jobs_cuahsihisclarksburgspids0 import implnet_job_cuahsihisclarksburgspids0
-from sch.implnet_sch_cuahsihisclarksburgspids0 import implnet_sch_cuahsihisclarksburgspids0
-from jobs.implnet_jobs_cuahsihiscrwaids0 import implnet_job_cuahsihiscrwaids0
-from sch.implnet_sch_cuahsihiscrwaids0 import implnet_sch_cuahsihiscrwaids0
-from jobs.implnet_jobs_cuahsihiscuisoids0 import implnet_job_cuahsihiscuisoids0
-from sch.implnet_sch_cuahsihiscuisoids0 import implnet_sch_cuahsihiscuisoids0
-from jobs.implnet_jobs_cuahsihisprovorivergamutids0 import implnet_job_cuahsihisprovorivergamutids0
-from sch.implnet_sch_cuahsihisprovorivergamutids0 import implnet_sch_cuahsihisprovorivergamutids0
-from jobs.implnet_jobs_cuahsihisirwaids0 import implnet_job_cuahsihisirwaids0
-from sch.implnet_sch_cuahsihisirwaids0 import implnet_sch_cuahsihisirwaids0
-from jobs.implnet_jobs_cuahsihisczoluquilloids0 import implnet_job_cuahsihisczoluquilloids0
-from sch.implnet_sch_cuahsihisczoluquilloids0 import implnet_sch_cuahsihisczoluquilloids0
-from jobs.implnet_jobs_cuahsihistuolumnemdwids0 import implnet_job_cuahsihistuolumnemdwids0
-from sch.implnet_sch_cuahsihistuolumnemdwids0 import implnet_sch_cuahsihistuolumnemdwids0
-from jobs.implnet_jobs_cuahsihisrmblids0 import implnet_job_cuahsihisrmblids0
-from sch.implnet_sch_cuahsihisrmblids0 import implnet_sch_cuahsihisrmblids0
-from jobs.implnet_jobs_cuahsihispanolaodmids0 import implnet_job_cuahsihispanolaodmids0
-from sch.implnet_sch_cuahsihispanolaodmids0 import implnet_sch_cuahsihispanolaodmids0
-from jobs.implnet_jobs_cuahsihisnewnids0 import implnet_job_cuahsihisnewnids0
-from sch.implnet_sch_cuahsihisnewnids0 import implnet_sch_cuahsihisnewnids0
-from jobs.implnet_jobs_cuahsihisczoudelids0 import implnet_job_cuahsihisczoudelids0
-from sch.implnet_sch_cuahsihisczoudelids0 import implnet_sch_cuahsihisczoudelids0
-from jobs.implnet_jobs_cuahsihisfarmrwaids0 import implnet_job_cuahsihisfarmrwaids0
-from sch.implnet_sch_cuahsihisfarmrwaids0 import implnet_sch_cuahsihisfarmrwaids0
-from jobs.implnet_jobs_cuahsihisskcmilltownids0 import implnet_job_cuahsihisskcmilltownids0
-from sch.implnet_sch_cuahsihisskcmilltownids0 import implnet_sch_cuahsihisskcmilltownids0
-from jobs.implnet_jobs_cuahsihisumbcgwids0 import implnet_job_cuahsihisumbcgwids0
-from sch.implnet_sch_cuahsihisumbcgwids0 import implnet_sch_cuahsihisumbcgwids0
-from jobs.implnet_jobs_cuahsihisshalenetworkodmids0 import implnet_job_cuahsihisshalenetworkodmids0
-from sch.implnet_sch_cuahsihisshalenetworkodmids0 import implnet_sch_cuahsihisshalenetworkodmids0
-from jobs.implnet_jobs_cuahsihisnevadosids0 import implnet_job_cuahsihisnevadosids0
-from sch.implnet_sch_cuahsihisnevadosids0 import implnet_sch_cuahsihisnevadosids0
-from jobs.implnet_jobs_cuahsihisweiherbachids0 import implnet_job_cuahsihisweiherbachids0
-from sch.implnet_sch_cuahsihisweiherbachids0 import implnet_sch_cuahsihisweiherbachids0
-from jobs.implnet_jobs_cuahsihismazarriverprojectids0 import implnet_job_cuahsihismazarriverprojectids0
-from sch.implnet_sch_cuahsihismazarriverprojectids0 import implnet_sch_cuahsihismazarriverprojectids0
-from jobs.implnet_jobs_cuahsihisgleonsunapeeids0 import implnet_job_cuahsihisgleonsunapeeids0
-from sch.implnet_sch_cuahsihisgleonsunapeeids0 import implnet_sch_cuahsihisgleonsunapeeids0
-from jobs.implnet_jobs_cuahsihisorsancohabids0 import implnet_job_cuahsihisorsancohabids0
-from sch.implnet_sch_cuahsihisorsancohabids0 import implnet_sch_cuahsihisorsancohabids0
-from jobs.implnet_jobs_cuahsihismwraids0 import implnet_job_cuahsihismwraids0
-from sch.implnet_sch_cuahsihismwraids0 import implnet_sch_cuahsihismwraids0
-from jobs.implnet_jobs_cuahsihismaaeriids0 import implnet_job_cuahsihismaaeriids0
-from sch.implnet_sch_cuahsihismaaeriids0 import implnet_sch_cuahsihismaaeriids0
-from jobs.implnet_jobs_cuahsihisnceiww2ids0 import implnet_job_cuahsihisnceiww2ids0
-from sch.implnet_sch_cuahsihisnceiww2ids0 import implnet_sch_cuahsihisnceiww2ids0
-from jobs.implnet_jobs_cuahsihistarlandwaterqualityids0 import implnet_job_cuahsihistarlandwaterqualityids0
-from sch.implnet_sch_cuahsihistarlandwaterqualityids0 import implnet_sch_cuahsihistarlandwaterqualityids0
-from jobs.implnet_jobs_cuahsihislczoodm2ids0 import implnet_job_cuahsihislczoodm2ids0
-from sch.implnet_sch_cuahsihislczoodm2ids0 import implnet_sch_cuahsihislczoodm2ids0
-from jobs.implnet_jobs_cuahsihiscocorahsids0 import implnet_job_cuahsihiscocorahsids0
-from sch.implnet_sch_cuahsihiscocorahsids0 import implnet_sch_cuahsihiscocorahsids0
-from jobs.implnet_jobs_cuahsihisparalanaturalezaids0 import implnet_job_cuahsihisparalanaturalezaids0
-from sch.implnet_sch_cuahsihisparalanaturalezaids0 import implnet_sch_cuahsihisparalanaturalezaids0
-from jobs.implnet_jobs_cuahsihisczocatalinaids0 import implnet_job_cuahsihisczocatalinaids0
-from sch.implnet_sch_cuahsihisczocatalinaids0 import implnet_sch_cuahsihisczocatalinaids0
-from jobs.implnet_jobs_cuahsihisieeratwilkesuniversityids0 import implnet_job_cuahsihisieeratwilkesuniversityids0
-from sch.implnet_sch_cuahsihisieeratwilkesuniversityids0 import implnet_sch_cuahsihisieeratwilkesuniversityids0
-from jobs.implnet_jobs_cuahsihismudlakeids0 import implnet_job_cuahsihismudlakeids0
-from sch.implnet_sch_cuahsihismudlakeids0 import implnet_sch_cuahsihismudlakeids0
-from jobs.implnet_jobs_cuahsihismwdisids0 import implnet_job_cuahsihismwdisids0
-from sch.implnet_sch_cuahsihismwdisids0 import implnet_sch_cuahsihismwdisids0
-from jobs.implnet_jobs_cuahsihisloganriverids0 import implnet_job_cuahsihisloganriverids0
-from sch.implnet_sch_cuahsihisloganriverids0 import implnet_sch_cuahsihisloganriverids0
-from jobs.implnet_jobs_cuahsihisscanids0 import implnet_job_cuahsihisscanids0
-from sch.implnet_sch_cuahsihisscanids0 import implnet_sch_cuahsihisscanids0
-from jobs.implnet_jobs_cuahsihisnashrwaids0 import implnet_job_cuahsihisnashrwaids0
-from sch.implnet_sch_cuahsihisnashrwaids0 import implnet_sch_cuahsihisnashrwaids0
-from jobs.implnet_jobs_cuahsihismobilecrowdhydrologyids0 import implnet_job_cuahsihismobilecrowdhydrologyids0
-from sch.implnet_sch_cuahsihismobilecrowdhydrologyids0 import implnet_sch_cuahsihismobilecrowdhydrologyids0
-from jobs.implnet_jobs_cuahsihisandrewsforestlterids0 import implnet_job_cuahsihisandrewsforestlterids0
-from sch.implnet_sch_cuahsihisandrewsforestlterids0 import implnet_sch_cuahsihisandrewsforestlterids0
-from jobs.implnet_jobs_cuahsihisloganrivergamutids0 import implnet_job_cuahsihisloganrivergamutids0
-from sch.implnet_sch_cuahsihisloganrivergamutids0 import implnet_sch_cuahsihisloganrivergamutids0
-from jobs.implnet_jobs_cuahsihislittlebearriverids0 import implnet_job_cuahsihislittlebearriverids0
-from sch.implnet_sch_cuahsihislittlebearriverids0 import implnet_sch_cuahsihislittlebearriverids0
-from jobs.implnet_jobs_cuahsihislterntlwoodruffids0 import implnet_job_cuahsihislterntlwoodruffids0
-from sch.implnet_sch_cuahsihislterntlwoodruffids0 import implnet_sch_cuahsihislterntlwoodruffids0
-from jobs.implnet_jobs_cuahsihissagehencreekids0 import implnet_job_cuahsihissagehencreekids0
-from sch.implnet_sch_cuahsihissagehencreekids0 import implnet_sch_cuahsihissagehencreekids0
-from jobs.implnet_jobs_cuahsihisshalenetworkodmids1 import implnet_job_cuahsihisshalenetworkodmids1
-from sch.implnet_sch_cuahsihisshalenetworkodmids1 import implnet_sch_cuahsihisshalenetworkodmids1
-from jobs.implnet_jobs_cuahsihisfrcwqmids0 import implnet_job_cuahsihisfrcwqmids0
-from sch.implnet_sch_cuahsihisfrcwqmids0 import implnet_sch_cuahsihisfrcwqmids0
-from jobs.implnet_jobs_cuahsihishydrodataczdids0 import implnet_job_cuahsihishydrodataczdids0
-from sch.implnet_sch_cuahsihishydrodataczdids0 import implnet_sch_cuahsihishydrodataczdids0
-from jobs.implnet_jobs_cuahsihisdrwiids0 import implnet_job_cuahsihisdrwiids0
-from sch.implnet_sch_cuahsihisdrwiids0 import implnet_sch_cuahsihisdrwiids0
-from jobs.implnet_jobs_cuahsihisubwpadids0 import implnet_job_cuahsihisubwpadids0
-from sch.implnet_sch_cuahsihisubwpadids0 import implnet_sch_cuahsihisubwpadids0
-from jobs.implnet_jobs_cuahsihistrwaids0 import implnet_job_cuahsihistrwaids0
-from sch.implnet_sch_cuahsihistrwaids0 import implnet_sch_cuahsihistrwaids0
-from jobs.implnet_jobs_cuahsihisredbuttecreekgamutids0 import implnet_job_cuahsihisredbuttecreekgamutids0
-from sch.implnet_sch_cuahsihisredbuttecreekgamutids0 import implnet_sch_cuahsihisredbuttecreekgamutids0
-from jobs.implnet_jobs_cuahsihisglacialridgeids0 import implnet_job_cuahsihisglacialridgeids0
-from sch.implnet_sch_cuahsihisglacialridgeids0 import implnet_sch_cuahsihisglacialridgeids0
-from jobs.implnet_jobs_cuahsihisfcelterids0 import implnet_job_cuahsihisfcelterids0
-from sch.implnet_sch_cuahsihisfcelterids0 import implnet_sch_cuahsihisfcelterids0
-from jobs.implnet_jobs_cuahsihisczoarizids0 import implnet_job_cuahsihisczoarizids0
-from sch.implnet_sch_cuahsihisczoarizids0 import implnet_sch_cuahsihisczoarizids0
-from jobs.implnet_jobs_cuahsihiscalvinhhsids0 import implnet_job_cuahsihiscalvinhhsids0
-from sch.implnet_sch_cuahsihiscalvinhhsids0 import implnet_sch_cuahsihiscalvinhhsids0
-from jobs.implnet_jobs_cuahsihissnotelids0 import implnet_job_cuahsihissnotelids0
-from sch.implnet_sch_cuahsihissnotelids0 import implnet_sch_cuahsihissnotelids0
-from jobs.implnet_jobs_cuahsihisnevcanids0 import implnet_job_cuahsihisnevcanids0
-from sch.implnet_sch_cuahsihisnevcanids0 import implnet_sch_cuahsihisnevcanids0
-from jobs.implnet_jobs_cuahsihisczopsuids0 import implnet_job_cuahsihisczopsuids0
-from sch.implnet_sch_cuahsihisczopsuids0 import implnet_sch_cuahsihisczopsuids0
-from jobs.implnet_jobs_cuahsihisbrazilucbids0 import implnet_job_cuahsihisbrazilucbids0
-from sch.implnet_sch_cuahsihisbrazilucbids0 import implnet_sch_cuahsihisbrazilucbids0
-from jobs.implnet_jobs_cuahsihisgleonauburnids0 import implnet_job_cuahsihisgleonauburnids0
-from sch.implnet_sch_cuahsihisgleonauburnids0 import implnet_sch_cuahsihisgleonauburnids0
-from jobs.implnet_jobs_cuahsihislaselvastreamdischargeids0 import implnet_job_cuahsihislaselvastreamdischargeids0
-from sch.implnet_sch_cuahsihislaselvastreamdischargeids0 import implnet_sch_cuahsihislaselvastreamdischargeids0
-from jobs.implnet_jobs_cuahsihisisbenaids0 import implnet_job_cuahsihisisbenaids0
-from sch.implnet_sch_cuahsihisisbenaids0 import implnet_sch_cuahsihisisbenaids0
-from jobs.implnet_jobs_cuahsihisswedishmonitoringdataids0 import implnet_job_cuahsihisswedishmonitoringdataids0
-from sch.implnet_sch_cuahsihisswedishmonitoringdataids0 import implnet_sch_cuahsihisswedishmonitoringdataids0
-from jobs.implnet_jobs_cuahsihisunhsnowids0 import implnet_job_cuahsihisunhsnowids0
-from sch.implnet_sch_cuahsihisunhsnowids0 import implnet_sch_cuahsihisunhsnowids0
-from jobs.implnet_jobs_cuahsihishassbergeids0 import implnet_job_cuahsihishassbergeids0
-from sch.implnet_sch_cuahsihishassbergeids0 import implnet_sch_cuahsihishassbergeids0
-from jobs.implnet_jobs_cuahsihisnhgswofids0 import implnet_job_cuahsihisnhgswofids0
-from sch.implnet_sch_cuahsihisnhgswofids0 import implnet_sch_cuahsihisnhgswofids0
-from jobs.implnet_jobs_cuahsihisgonggaids0 import implnet_job_cuahsihisgonggaids0
-from sch.implnet_sch_cuahsihisgonggaids0 import implnet_sch_cuahsihisgonggaids0
-from jobs.implnet_jobs_cuahsihismopexids0 import implnet_job_cuahsihismopexids0
-from sch.implnet_sch_cuahsihismopexids0 import implnet_sch_cuahsihismopexids0
-from jobs.implnet_jobs_cagagespids0 import implnet_job_cagagespids0
-from sch.implnet_sch_cagagespids0 import implnet_sch_cagagespids0
-from jobs.implnet_jobs_sechydrgreg0 import implnet_job_sechydrgreg0
-from sch.implnet_sch_sechydrgreg0 import implnet_sch_sechydrgreg0
-from jobs.implnet_jobs_counties0 import implnet_job_counties0
-from sch.implnet_sch_counties0 import implnet_sch_counties0
-from jobs.implnet_jobs_pws0 import implnet_job_pws0
-from sch.implnet_sch_pws0 import implnet_sch_pws0
-from jobs.implnet_jobs_hu060 import implnet_job_hu060
-from sch.implnet_sch_hu060 import implnet_sch_hu060
-from jobs.implnet_jobs_nataq0 import implnet_job_nataq0
-from sch.implnet_sch_nataq0 import implnet_sch_nataq0
-from jobs.implnet_jobs_cbsa0 import implnet_job_cbsa0
-from sch.implnet_sch_cbsa0 import implnet_sch_cbsa0
-from jobs.implnet_jobs_hu080 import implnet_job_hu080
-from sch.implnet_sch_hu080 import implnet_sch_hu080
-from jobs.implnet_jobs_hu040 import implnet_job_hu040
-from sch.implnet_sch_hu040 import implnet_sch_hu040
-from jobs.implnet_jobs_princiaq0 import implnet_job_princiaq0
-from sch.implnet_sch_princiaq0 import implnet_sch_princiaq0
-from jobs.implnet_jobs_refgage0 import implnet_job_refgage0
-from sch.implnet_sch_refgage0 import implnet_sch_refgage0
-from jobs.implnet_jobs_refgage3 import implnet_job_refgage3
-from sch.implnet_sch_refgage3 import implnet_sch_refgage3
-from jobs.implnet_jobs_refgage2 import implnet_job_refgage2
-from sch.implnet_sch_refgage2 import implnet_sch_refgage2
-from jobs.implnet_jobs_refgage1 import implnet_job_refgage1
-from sch.implnet_sch_refgage1 import implnet_sch_refgage1
-from jobs.implnet_jobs_dams0 import implnet_job_dams0
-from sch.implnet_sch_dams0 import implnet_sch_dams0
-from jobs.implnet_jobs_dams1 import implnet_job_dams1
-from sch.implnet_sch_dams1 import implnet_sch_dams1
-from jobs.implnet_jobs_ua100 import implnet_job_ua100
-from sch.implnet_sch_ua100 import implnet_sch_ua100
-from jobs.implnet_jobs_states0 import implnet_job_states0
-from sch.implnet_sch_states0 import implnet_sch_states0
-from jobs.implnet_jobs_hu100 import implnet_job_hu100
-from sch.implnet_sch_hu100 import implnet_sch_hu100
-from jobs.implnet_jobs_aiannh0 import implnet_job_aiannh0
-from sch.implnet_sch_aiannh0 import implnet_sch_aiannh0
-from jobs.implnet_jobs_hu020 import implnet_job_hu020
-from sch.implnet_sch_hu020 import implnet_sch_hu020
-from jobs.implnet_jobs_mainstems0 import implnet_job_mainstems0
-from sch.implnet_sch_mainstems0 import implnet_sch_mainstems0
-from jobs.implnet_jobs_places0 import implnet_job_places0
-from sch.implnet_sch_places0 import implnet_sch_places0
-from jobs.implnet_jobs_hmw0 import implnet_job_hmw0
-from sch.implnet_sch_hmw0 import implnet_sch_hmw0
-from jobs.implnet_jobs_hmw1 import implnet_job_hmw1
-from sch.implnet_sch_hmw1 import implnet_sch_hmw1
-from jobs.implnet_jobs_huc12pp0 import implnet_job_huc12pp0
-from sch.implnet_sch_huc12pp0 import implnet_sch_huc12pp0
-from jobs.implnet_jobs_huc12pp1 import implnet_job_huc12pp1
-from sch.implnet_sch_huc12pp1 import implnet_sch_huc12pp1
-from jobs.implnet_jobs_nmwdiose3 import implnet_job_nmwdiose3
-from sch.implnet_sch_nmwdiose3 import implnet_sch_nmwdiose3
-from jobs.implnet_jobs_nmwdiose2 import implnet_job_nmwdiose2
-from sch.implnet_sch_nmwdiose2 import implnet_sch_nmwdiose2
-from jobs.implnet_jobs_nmwdiose0 import implnet_job_nmwdiose0
-from sch.implnet_sch_nmwdiose0 import implnet_sch_nmwdiose0
-from jobs.implnet_jobs_nmwdiose4 import implnet_job_nmwdiose4
-from sch.implnet_sch_nmwdiose4 import implnet_sch_nmwdiose4
-from jobs.implnet_jobs_nmwdiose1 import implnet_job_nmwdiose1
-from sch.implnet_sch_nmwdiose1 import implnet_sch_nmwdiose1
-from jobs.implnet_jobs_nmwdist0 import implnet_job_nmwdist0
-from sch.implnet_sch_nmwdist0 import implnet_sch_nmwdist0
-from jobs.implnet_jobs_selfieids0 import implnet_job_selfieids0
-from sch.implnet_sch_selfieids0 import implnet_sch_selfieids0
-from jobs.implnet_jobs_chyldpilotids0 import implnet_job_chyldpilotids0
-from sch.implnet_sch_chyldpilotids0 import implnet_sch_chyldpilotids0
-from jobs.implnet_jobs_rise0 import implnet_job_rise0
-from sch.implnet_sch_rise0 import implnet_sch_rise0
-from jobs.implnet_jobs_autotest10 import implnet_job_autotest10
-from sch.implnet_sch_autotest10 import implnet_sch_autotest10
-from jobs.implnet_jobs_links0 import implnet_job_links0
-from sch.implnet_sch_links0 import implnet_sch_links0
-from jobs.implnet_jobs_demo0 import implnet_job_demo0
-from sch.implnet_sch_demo0 import implnet_sch_demo0
-from jobs.implnet_jobs_autotest20 import implnet_job_autotest20
-from sch.implnet_sch_autotest20 import implnet_sch_autotest20
-from jobs.implnet_jobs_wade2 import implnet_job_wade2
-from sch.implnet_sch_wade2 import implnet_sch_wade2
-from jobs.implnet_jobs_wade0 import implnet_job_wade0
-from sch.implnet_sch_wade0 import implnet_sch_wade0
-from jobs.implnet_jobs_wade17 import implnet_job_wade17
-from sch.implnet_sch_wade17 import implnet_sch_wade17
-from jobs.implnet_jobs_wade9 import implnet_job_wade9
-from sch.implnet_sch_wade9 import implnet_sch_wade9
-from jobs.implnet_jobs_wade7 import implnet_job_wade7
-from sch.implnet_sch_wade7 import implnet_sch_wade7
-from jobs.implnet_jobs_wade3 import implnet_job_wade3
-from sch.implnet_sch_wade3 import implnet_sch_wade3
-from jobs.implnet_jobs_wade15 import implnet_job_wade15
-from sch.implnet_sch_wade15 import implnet_sch_wade15
-from jobs.implnet_jobs_wade5 import implnet_job_wade5
-from sch.implnet_sch_wade5 import implnet_sch_wade5
-from jobs.implnet_jobs_wade10 import implnet_job_wade10
-from sch.implnet_sch_wade10 import implnet_sch_wade10
-from jobs.implnet_jobs_wade14 import implnet_job_wade14
-from sch.implnet_sch_wade14 import implnet_sch_wade14
-from jobs.implnet_jobs_wade18 import implnet_job_wade18
-from sch.implnet_sch_wade18 import implnet_sch_wade18
-from jobs.implnet_jobs_wade13 import implnet_job_wade13
-from sch.implnet_sch_wade13 import implnet_sch_wade13
-from jobs.implnet_jobs_wade8 import implnet_job_wade8
-from sch.implnet_sch_wade8 import implnet_sch_wade8
-from jobs.implnet_jobs_wade19 import implnet_job_wade19
-from sch.implnet_sch_wade19 import implnet_sch_wade19
-from jobs.implnet_jobs_wade12 import implnet_job_wade12
-from sch.implnet_sch_wade12 import implnet_sch_wade12
-from jobs.implnet_jobs_wade4 import implnet_job_wade4
-from sch.implnet_sch_wade4 import implnet_sch_wade4
-from jobs.implnet_jobs_wade16 import implnet_job_wade16
-from sch.implnet_sch_wade16 import implnet_sch_wade16
-from jobs.implnet_jobs_wade1 import implnet_job_wade1
-from sch.implnet_sch_wade1 import implnet_sch_wade1
-from jobs.implnet_jobs_wade6 import implnet_job_wade6
-from sch.implnet_sch_wade6 import implnet_sch_wade6
-from jobs.implnet_jobs_wade11 import implnet_job_wade11
-from sch.implnet_sch_wade11 import implnet_sch_wade11
-
-@repository
-def gleaner():
- jobs = [implnet_job_nwisgw20, implnet_job_nwisgw22, implnet_job_nwisgw16, implnet_job_nwisgw12, implnet_job_nwisgw25, implnet_job_nwisgw14, implnet_job_nwisgw23, implnet_job_nwisgw10, implnet_job_nwisgw15, implnet_job_nwisgw2, implnet_job_nwisgw24, implnet_job_nwisgw9, implnet_job_nwisgw19, implnet_job_nwisgw28, implnet_job_nwisgw26, implnet_job_nwisgw5, implnet_job_nwisgw13, implnet_job_nwisgw6, implnet_job_nwisgw3, implnet_job_nwisgw4, implnet_job_nwisgw1, implnet_job_nwisgw21, implnet_job_nwisgw27, implnet_job_nwisgw8, implnet_job_nwisgw17, implnet_job_nwisgw18, implnet_job_nwisgw7, implnet_job_nwisgw11, implnet_job_nwisgw0, implnet_job_nwissite1, implnet_job_nwissite3, implnet_job_nwissite0, implnet_job_nwissite2, implnet_job_gfv11pois1, implnet_job_gfv11pois0, implnet_job_hydrologicunit0, implnet_job_damspids0, implnet_job_cuahsihishydrodataczhrids0, implnet_job_cuahsihisnooksackmicroclimatenetworkids0, implnet_job_cuahsihisneonids0, implnet_job_cuahsihisglobalriversobservatoryids0, implnet_job_cuahsihistncwaterdataids0, implnet_job_cuahsihisscotlandnwisids0, implnet_job_cuahsihisczoboulderids0, implnet_job_cuahsihisyosemitehydroclimatenetworkids0, implnet_job_cuahsihismuddyriverids0, implnet_job_cuahsihisczomercedids0, implnet_job_cuahsihisghcnids0, implnet_job_cuahsihismmaatacamaids0, implnet_job_cuahsihisumbcwqids0, implnet_job_cuahsihisgleonlakeannieids0, implnet_job_cuahsihisluwlids0, implnet_job_cuahsihiscedarriverids0, implnet_job_cuahsihisccbepdapids0, implnet_job_cuahsihiskansasweatherdataids0, implnet_job_cuahsihisodmkentstateids0, implnet_job_cuahsihisgleondorsetids0, implnet_job_cuahsihisclarksburgspids0, implnet_job_cuahsihiscrwaids0, implnet_job_cuahsihiscuisoids0, implnet_job_cuahsihisprovorivergamutids0, implnet_job_cuahsihisirwaids0, implnet_job_cuahsihisczoluquilloids0, implnet_job_cuahsihistuolumnemdwids0, implnet_job_cuahsihisrmblids0, implnet_job_cuahsihispanolaodmids0, implnet_job_cuahsihisnewnids0, implnet_job_cuahsihisczoudelids0, implnet_job_cuahsihisfarmrwaids0, implnet_job_cuahsihisskcmilltownids0, implnet_job_cuahsihisumbcgwids0, implnet_job_cuahsihisshalenetworkodmids0, implnet_job_cuahsihisnevadosids0, implnet_job_cuahsihisweiherbachids0, implnet_job_cuahsihismazarriverprojectids0, implnet_job_cuahsihisgleonsunapeeids0, implnet_job_cuahsihisorsancohabids0, implnet_job_cuahsihismwraids0, implnet_job_cuahsihismaaeriids0, implnet_job_cuahsihisnceiww2ids0, implnet_job_cuahsihistarlandwaterqualityids0, implnet_job_cuahsihislczoodm2ids0, implnet_job_cuahsihiscocorahsids0, implnet_job_cuahsihisparalanaturalezaids0, implnet_job_cuahsihisczocatalinaids0, implnet_job_cuahsihisieeratwilkesuniversityids0, implnet_job_cuahsihismudlakeids0, implnet_job_cuahsihismwdisids0, implnet_job_cuahsihisloganriverids0, implnet_job_cuahsihisscanids0, implnet_job_cuahsihisnashrwaids0, implnet_job_cuahsihismobilecrowdhydrologyids0, implnet_job_cuahsihisandrewsforestlterids0, implnet_job_cuahsihisloganrivergamutids0, implnet_job_cuahsihislittlebearriverids0, implnet_job_cuahsihislterntlwoodruffids0, implnet_job_cuahsihissagehencreekids0, implnet_job_cuahsihisshalenetworkodmids1, implnet_job_cuahsihisfrcwqmids0, implnet_job_cuahsihishydrodataczdids0, implnet_job_cuahsihisdrwiids0, implnet_job_cuahsihisubwpadids0, implnet_job_cuahsihistrwaids0, implnet_job_cuahsihisredbuttecreekgamutids0, implnet_job_cuahsihisglacialridgeids0, implnet_job_cuahsihisfcelterids0, implnet_job_cuahsihisczoarizids0, implnet_job_cuahsihiscalvinhhsids0, implnet_job_cuahsihissnotelids0, implnet_job_cuahsihisnevcanids0, implnet_job_cuahsihisczopsuids0, implnet_job_cuahsihisbrazilucbids0, implnet_job_cuahsihisgleonauburnids0, implnet_job_cuahsihislaselvastreamdischargeids0, implnet_job_cuahsihisisbenaids0, implnet_job_cuahsihisswedishmonitoringdataids0, implnet_job_cuahsihisunhsnowids0, implnet_job_cuahsihishassbergeids0, implnet_job_cuahsihisnhgswofids0, implnet_job_cuahsihisgonggaids0, implnet_job_cuahsihismopexids0, implnet_job_cagagespids0, implnet_job_sechydrgreg0, implnet_job_counties0, implnet_job_pws0, implnet_job_hu060, implnet_job_nataq0, implnet_job_cbsa0, implnet_job_hu080, implnet_job_hu040, implnet_job_princiaq0, implnet_job_refgage0, implnet_job_refgage3, implnet_job_refgage2, implnet_job_refgage1, implnet_job_dams0, implnet_job_dams1, implnet_job_ua100, implnet_job_states0, implnet_job_hu100, implnet_job_aiannh0, implnet_job_hu020, implnet_job_mainstems0, implnet_job_places0, implnet_job_hmw0, implnet_job_hmw1, implnet_job_huc12pp0, implnet_job_huc12pp1, implnet_job_nmwdiose3, implnet_job_nmwdiose2, implnet_job_nmwdiose0, implnet_job_nmwdiose4, implnet_job_nmwdiose1, implnet_job_nmwdist0, implnet_job_selfieids0, implnet_job_chyldpilotids0, implnet_job_rise0, implnet_job_autotest10, implnet_job_links0, implnet_job_demo0, implnet_job_autotest20, implnet_job_wade2, implnet_job_wade0, implnet_job_wade17, implnet_job_wade9, implnet_job_wade7, implnet_job_wade3, implnet_job_wade15, implnet_job_wade5, implnet_job_wade10, implnet_job_wade14, implnet_job_wade18, implnet_job_wade13, implnet_job_wade8, implnet_job_wade19, implnet_job_wade12, implnet_job_wade4, implnet_job_wade16, implnet_job_wade1, implnet_job_wade6, implnet_job_wade11]
- schedules = [implnet_sch_nwisgw20, implnet_sch_nwisgw22, implnet_sch_nwisgw16, implnet_sch_nwisgw12, implnet_sch_nwisgw25, implnet_sch_nwisgw14, implnet_sch_nwisgw23, implnet_sch_nwisgw10, implnet_sch_nwisgw15, implnet_sch_nwisgw2, implnet_sch_nwisgw24, implnet_sch_nwisgw9, implnet_sch_nwisgw19, implnet_sch_nwisgw28, implnet_sch_nwisgw26, implnet_sch_nwisgw5, implnet_sch_nwisgw13, implnet_sch_nwisgw6, implnet_sch_nwisgw3, implnet_sch_nwisgw4, implnet_sch_nwisgw1, implnet_sch_nwisgw21, implnet_sch_nwisgw27, implnet_sch_nwisgw8, implnet_sch_nwisgw17, implnet_sch_nwisgw18, implnet_sch_nwisgw7, implnet_sch_nwisgw11, implnet_sch_nwisgw0, implnet_sch_nwissite1, implnet_sch_nwissite3, implnet_sch_nwissite0, implnet_sch_nwissite2, implnet_sch_gfv11pois1, implnet_sch_gfv11pois0, implnet_sch_hydrologicunit0, implnet_sch_damspids0, implnet_sch_cuahsihishydrodataczhrids0, implnet_sch_cuahsihisnooksackmicroclimatenetworkids0, implnet_sch_cuahsihisneonids0, implnet_sch_cuahsihisglobalriversobservatoryids0, implnet_sch_cuahsihistncwaterdataids0, implnet_sch_cuahsihisscotlandnwisids0, implnet_sch_cuahsihisczoboulderids0, implnet_sch_cuahsihisyosemitehydroclimatenetworkids0, implnet_sch_cuahsihismuddyriverids0, implnet_sch_cuahsihisczomercedids0, implnet_sch_cuahsihisghcnids0, implnet_sch_cuahsihismmaatacamaids0, implnet_sch_cuahsihisumbcwqids0, implnet_sch_cuahsihisgleonlakeannieids0, implnet_sch_cuahsihisluwlids0, implnet_sch_cuahsihiscedarriverids0, implnet_sch_cuahsihisccbepdapids0, implnet_sch_cuahsihiskansasweatherdataids0, implnet_sch_cuahsihisodmkentstateids0, implnet_sch_cuahsihisgleondorsetids0, implnet_sch_cuahsihisclarksburgspids0, implnet_sch_cuahsihiscrwaids0, implnet_sch_cuahsihiscuisoids0, implnet_sch_cuahsihisprovorivergamutids0, implnet_sch_cuahsihisirwaids0, implnet_sch_cuahsihisczoluquilloids0, implnet_sch_cuahsihistuolumnemdwids0, implnet_sch_cuahsihisrmblids0, implnet_sch_cuahsihispanolaodmids0, implnet_sch_cuahsihisnewnids0, implnet_sch_cuahsihisczoudelids0, implnet_sch_cuahsihisfarmrwaids0, implnet_sch_cuahsihisskcmilltownids0, implnet_sch_cuahsihisumbcgwids0, implnet_sch_cuahsihisshalenetworkodmids0, implnet_sch_cuahsihisnevadosids0, implnet_sch_cuahsihisweiherbachids0, implnet_sch_cuahsihismazarriverprojectids0, implnet_sch_cuahsihisgleonsunapeeids0, implnet_sch_cuahsihisorsancohabids0, implnet_sch_cuahsihismwraids0, implnet_sch_cuahsihismaaeriids0, implnet_sch_cuahsihisnceiww2ids0, implnet_sch_cuahsihistarlandwaterqualityids0, implnet_sch_cuahsihislczoodm2ids0, implnet_sch_cuahsihiscocorahsids0, implnet_sch_cuahsihisparalanaturalezaids0, implnet_sch_cuahsihisczocatalinaids0, implnet_sch_cuahsihisieeratwilkesuniversityids0, implnet_sch_cuahsihismudlakeids0, implnet_sch_cuahsihismwdisids0, implnet_sch_cuahsihisloganriverids0, implnet_sch_cuahsihisscanids0, implnet_sch_cuahsihisnashrwaids0, implnet_sch_cuahsihismobilecrowdhydrologyids0, implnet_sch_cuahsihisandrewsforestlterids0, implnet_sch_cuahsihisloganrivergamutids0, implnet_sch_cuahsihislittlebearriverids0, implnet_sch_cuahsihislterntlwoodruffids0, implnet_sch_cuahsihissagehencreekids0, implnet_sch_cuahsihisshalenetworkodmids1, implnet_sch_cuahsihisfrcwqmids0, implnet_sch_cuahsihishydrodataczdids0, implnet_sch_cuahsihisdrwiids0, implnet_sch_cuahsihisubwpadids0, implnet_sch_cuahsihistrwaids0, implnet_sch_cuahsihisredbuttecreekgamutids0, implnet_sch_cuahsihisglacialridgeids0, implnet_sch_cuahsihisfcelterids0, implnet_sch_cuahsihisczoarizids0, implnet_sch_cuahsihiscalvinhhsids0, implnet_sch_cuahsihissnotelids0, implnet_sch_cuahsihisnevcanids0, implnet_sch_cuahsihisczopsuids0, implnet_sch_cuahsihisbrazilucbids0, implnet_sch_cuahsihisgleonauburnids0, implnet_sch_cuahsihislaselvastreamdischargeids0, implnet_sch_cuahsihisisbenaids0, implnet_sch_cuahsihisswedishmonitoringdataids0, implnet_sch_cuahsihisunhsnowids0, implnet_sch_cuahsihishassbergeids0, implnet_sch_cuahsihisnhgswofids0, implnet_sch_cuahsihisgonggaids0, implnet_sch_cuahsihismopexids0, implnet_sch_cagagespids0, implnet_sch_sechydrgreg0, implnet_sch_counties0, implnet_sch_pws0, implnet_sch_hu060, implnet_sch_nataq0, implnet_sch_cbsa0, implnet_sch_hu080, implnet_sch_hu040, implnet_sch_princiaq0, implnet_sch_refgage0, implnet_sch_refgage3, implnet_sch_refgage2, implnet_sch_refgage1, implnet_sch_dams0, implnet_sch_dams1, implnet_sch_ua100, implnet_sch_states0, implnet_sch_hu100, implnet_sch_aiannh0, implnet_sch_hu020, implnet_sch_mainstems0, implnet_sch_places0, implnet_sch_hmw0, implnet_sch_hmw1, implnet_sch_huc12pp0, implnet_sch_huc12pp1, implnet_sch_nmwdiose3, implnet_sch_nmwdiose2, implnet_sch_nmwdiose0, implnet_sch_nmwdiose4, implnet_sch_nmwdiose1, implnet_sch_nmwdist0, implnet_sch_selfieids0, implnet_sch_chyldpilotids0, implnet_sch_rise0, implnet_sch_autotest10, implnet_sch_links0, implnet_sch_demo0, implnet_sch_autotest20, implnet_sch_wade2, implnet_sch_wade0, implnet_sch_wade17, implnet_sch_wade9, implnet_sch_wade7, implnet_sch_wade3, implnet_sch_wade15, implnet_sch_wade5, implnet_sch_wade10, implnet_sch_wade14, implnet_sch_wade18, implnet_sch_wade13, implnet_sch_wade8, implnet_sch_wade19, implnet_sch_wade12, implnet_sch_wade4, implnet_sch_wade16, implnet_sch_wade1, implnet_sch_wade6, implnet_sch_wade11]
-
-
- return jobs + schedules
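The repository module removed above is generated: one import pair per source, then a `@repository` function returning the concatenated jobs and schedules. Since every module follows the `jobs.implnet_jobs_<name>` / `sch.implnet_sch_<name>` convention visible in the imports, the same repository could be assembled dynamically. A hedged sketch under that naming assumption (the `SOURCES` list here is an illustrative subset, not the repo's source registry):

```python
import importlib

from dagster import repository

# Illustrative subset; the generated file enumerates every source explicitly.
SOURCES = ["nwisgw0", "nwissite0", "wade9"]

def _load(module_fmt, attr_fmt):
    # import each generated module and pull out its job/schedule definition
    return [getattr(importlib.import_module(module_fmt.format(s)),
                    attr_fmt.format(s)) for s in SOURCES]

@repository
def gleaner():
    jobs = _load("jobs.implnet_jobs_{}", "implnet_job_{}")
    schedules = _load("sch.implnet_sch_{}", "implnet_sch_{}")
    return jobs + schedules
```

This keeps the repository definition constant as sources are added or removed.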
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_aiannh0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_aiannh0.py
deleted file mode 100644
index 015154da..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_aiannh0.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_aiannh0 import implnet_job_aiannh0
-
-@schedule(cron_schedule="0 8 24 * *", job=implnet_job_aiannh0, execution_timezone="US/Central")
-def implnet_sch_aiannh0(_context):
- run_config = {}
- return run_config
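This and the following deleted schedule modules are all the same eight-line template, with only the job name and cron expression varying. The cron strings stagger the sources across hours (second field) and days of the month (third field), so `0 8 24 * *` above fires at 08:00 on the 24th of each month. A minimal factory sketch of that template (the helper and the explicit `name=` argument are assumptions, not part of the generator):

```python
from dagster import schedule

def make_monthly_schedule(job, hour, day_of_month):
    """Build a once-a-month schedule shaped like the implnet_sch_* modules."""
    @schedule(
        cron_schedule=f"0 {hour} {day_of_month} * *",
        job=job,
        name=f"sch_{job.name}",  # unique name per generated schedule
        execution_timezone="US/Central",
    )
    def _sch(_context):
        return {}  # empty run_config, as in the generated code
    return _sch
```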
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_autotest10.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_autotest10.py
deleted file mode 100644
index 8a954b82..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_autotest10.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_autotest10 import implnet_job_autotest10
-
-@schedule(cron_schedule="0 4 27 * *", job=implnet_job_autotest10, execution_timezone="US/Central")
-def implnet_sch_autotest10(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_autotest20.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_autotest20.py
deleted file mode 100644
index 9a29f36c..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_autotest20.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_autotest20 import implnet_job_autotest20
-
-@schedule(cron_schedule="0 16 27 * *", job=implnet_job_autotest20, execution_timezone="US/Central")
-def implnet_sch_autotest20(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cagagespids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cagagespids0.py
deleted file mode 100644
index e4db94d9..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cagagespids0.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_cagagespids0 import implnet_job_cagagespids0
-
-@schedule(cron_schedule="0 4 21 * *", job=implnet_job_cagagespids0, execution_timezone="US/Central")
-def implnet_sch_cagagespids0(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cbsa0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cbsa0.py
deleted file mode 100644
index eda1610a..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cbsa0.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_cbsa0 import implnet_job_cbsa0
-
-@schedule(cron_schedule="0 4 22 * *", job=implnet_job_cbsa0, execution_timezone="US/Central")
-def implnet_sch_cbsa0(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_chyldpilotids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_chyldpilotids0.py
deleted file mode 100644
index d27a7587..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_chyldpilotids0.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_chyldpilotids0 import implnet_job_chyldpilotids0
-
-@schedule(cron_schedule="0 20 26 * *", job=implnet_job_chyldpilotids0, execution_timezone="US/Central")
-def implnet_sch_chyldpilotids0(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_counties0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_counties0.py
deleted file mode 100644
index 5c4844e4..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_counties0.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_counties0 import implnet_job_counties0
-
-@schedule(cron_schedule="0 12 21 * *", job=implnet_job_counties0, execution_timezone="US/Central")
-def implnet_sch_counties0(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisandrewsforestlterids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisandrewsforestlterids0.py
deleted file mode 100644
index 5e0b22a2..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisandrewsforestlterids0.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_cuahsihisandrewsforestlterids0 import implnet_job_cuahsihisandrewsforestlterids0
-
-@schedule(cron_schedule="0 8 16 * *", job=implnet_job_cuahsihisandrewsforestlterids0, execution_timezone="US/Central")
-def implnet_sch_cuahsihisandrewsforestlterids0(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisbrazilucbids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisbrazilucbids0.py
deleted file mode 100644
index 3456f16b..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisbrazilucbids0.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_cuahsihisbrazilucbids0 import implnet_job_cuahsihisbrazilucbids0
-
-@schedule(cron_schedule="0 12 19 * *", job=implnet_job_cuahsihisbrazilucbids0, execution_timezone="US/Central")
-def implnet_sch_cuahsihisbrazilucbids0(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihiscalvinhhsids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihiscalvinhhsids0.py
deleted file mode 100644
index 20ae9051..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihiscalvinhhsids0.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_cuahsihiscalvinhhsids0 import implnet_job_cuahsihiscalvinhhsids0
-
-@schedule(cron_schedule="0 20 18 * *", job=implnet_job_cuahsihiscalvinhhsids0, execution_timezone="US/Central")
-def implnet_sch_cuahsihiscalvinhhsids0(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisccbepdapids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisccbepdapids0.py
deleted file mode 100644
index 8e46588a..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisccbepdapids0.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_cuahsihisccbepdapids0 import implnet_job_cuahsihisccbepdapids0
-
-@schedule(cron_schedule="0 20 9 * *", job=implnet_job_cuahsihisccbepdapids0, execution_timezone="US/Central")
-def implnet_sch_cuahsihisccbepdapids0(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihiscedarriverids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihiscedarriverids0.py
deleted file mode 100644
index 36f484f4..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihiscedarriverids0.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_cuahsihiscedarriverids0 import implnet_job_cuahsihiscedarriverids0
-
-@schedule(cron_schedule="0 16 9 * *", job=implnet_job_cuahsihiscedarriverids0, execution_timezone="US/Central")
-def implnet_sch_cuahsihiscedarriverids0(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisclarksburgspids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisclarksburgspids0.py
deleted file mode 100644
index f5ab8105..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisclarksburgspids0.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_cuahsihisclarksburgspids0 import implnet_job_cuahsihisclarksburgspids0
-
-@schedule(cron_schedule="0 12 10 * *", job=implnet_job_cuahsihisclarksburgspids0, execution_timezone="US/Central")
-def implnet_sch_cuahsihisclarksburgspids0(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihiscocorahsids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihiscocorahsids0.py
deleted file mode 100644
index 91519c57..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihiscocorahsids0.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_cuahsihiscocorahsids0 import implnet_job_cuahsihiscocorahsids0
-
-@schedule(cron_schedule="0 16 14 * *", job=implnet_job_cuahsihiscocorahsids0, execution_timezone="US/Central")
-def implnet_sch_cuahsihiscocorahsids0(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihiscrwaids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihiscrwaids0.py
deleted file mode 100644
index ea10a440..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihiscrwaids0.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_cuahsihiscrwaids0 import implnet_job_cuahsihiscrwaids0
-
-@schedule(cron_schedule="0 16 10 * *", job=implnet_job_cuahsihiscrwaids0, execution_timezone="US/Central")
-def implnet_sch_cuahsihiscrwaids0(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihiscuisoids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihiscuisoids0.py
deleted file mode 100644
index ece86067..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihiscuisoids0.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_cuahsihiscuisoids0 import implnet_job_cuahsihiscuisoids0
-
-@schedule(cron_schedule="0 20 10 * *", job=implnet_job_cuahsihiscuisoids0, execution_timezone="US/Central")
-def implnet_sch_cuahsihiscuisoids0(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisczoarizids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisczoarizids0.py
deleted file mode 100644
index bae4aa6e..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisczoarizids0.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_cuahsihisczoarizids0 import implnet_job_cuahsihisczoarizids0
-
-@schedule(cron_schedule="0 16 18 * *", job=implnet_job_cuahsihisczoarizids0, execution_timezone="US/Central")
-def implnet_sch_cuahsihisczoarizids0(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisczoboulderids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisczoboulderids0.py
deleted file mode 100644
index d9e166b1..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisczoboulderids0.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_cuahsihisczoboulderids0 import implnet_job_cuahsihisczoboulderids0
-
-@schedule(cron_schedule="0 4 8 * *", job=implnet_job_cuahsihisczoboulderids0, execution_timezone="US/Central")
-def implnet_sch_cuahsihisczoboulderids0(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisczocatalinaids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisczocatalinaids0.py
deleted file mode 100644
index 25e2ff35..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisczocatalinaids0.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_cuahsihisczocatalinaids0 import implnet_job_cuahsihisczocatalinaids0
-
-@schedule(cron_schedule="0 0 15 * *", job=implnet_job_cuahsihisczocatalinaids0, execution_timezone="US/Central")
-def implnet_sch_cuahsihisczocatalinaids0(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisczoluquilloids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisczoluquilloids0.py
deleted file mode 100644
index ee0b5b44..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisczoluquilloids0.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_cuahsihisczoluquilloids0 import implnet_job_cuahsihisczoluquilloids0
-
-@schedule(cron_schedule="0 8 11 * *", job=implnet_job_cuahsihisczoluquilloids0, execution_timezone="US/Central")
-def implnet_sch_cuahsihisczoluquilloids0(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisczomercedids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisczomercedids0.py
deleted file mode 100644
index ae338e42..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisczomercedids0.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_cuahsihisczomercedids0 import implnet_job_cuahsihisczomercedids0
-
-@schedule(cron_schedule="0 16 8 * *", job=implnet_job_cuahsihisczomercedids0, execution_timezone="US/Central")
-def implnet_sch_cuahsihisczomercedids0(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisczopsuids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisczopsuids0.py
deleted file mode 100644
index 23487d21..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisczopsuids0.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_cuahsihisczopsuids0 import implnet_job_cuahsihisczopsuids0
-
-@schedule(cron_schedule="0 8 19 * *", job=implnet_job_cuahsihisczopsuids0, execution_timezone="US/Central")
-def implnet_sch_cuahsihisczopsuids0(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisczoudelids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisczoudelids0.py
deleted file mode 100644
index b512c419..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisczoudelids0.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_cuahsihisczoudelids0 import implnet_job_cuahsihisczoudelids0
-
-@schedule(cron_schedule="0 4 12 * *", job=implnet_job_cuahsihisczoudelids0, execution_timezone="US/Central")
-def implnet_sch_cuahsihisczoudelids0(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisdrwiids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisdrwiids0.py
deleted file mode 100644
index 008c9c6f..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisdrwiids0.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_cuahsihisdrwiids0 import implnet_job_cuahsihisdrwiids0
-
-@schedule(cron_schedule="0 16 17 * *", job=implnet_job_cuahsihisdrwiids0, execution_timezone="US/Central")
-def implnet_sch_cuahsihisdrwiids0(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisfarmrwaids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisfarmrwaids0.py
deleted file mode 100644
index f85b0f22..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisfarmrwaids0.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_cuahsihisfarmrwaids0 import implnet_job_cuahsihisfarmrwaids0
-
-@schedule(cron_schedule="0 8 12 * *", job=implnet_job_cuahsihisfarmrwaids0, execution_timezone="US/Central")
-def implnet_sch_cuahsihisfarmrwaids0(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisfcelterids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisfcelterids0.py
deleted file mode 100644
index 9d8b942b..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisfcelterids0.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_cuahsihisfcelterids0 import implnet_job_cuahsihisfcelterids0
-
-@schedule(cron_schedule="0 12 18 * *", job=implnet_job_cuahsihisfcelterids0, execution_timezone="US/Central")
-def implnet_sch_cuahsihisfcelterids0(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisfrcwqmids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisfrcwqmids0.py
deleted file mode 100644
index 78f83176..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisfrcwqmids0.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_cuahsihisfrcwqmids0 import implnet_job_cuahsihisfrcwqmids0
-
-@schedule(cron_schedule="0 8 17 * *", job=implnet_job_cuahsihisfrcwqmids0, execution_timezone="US/Central")
-def implnet_sch_cuahsihisfrcwqmids0(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisghcnids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisghcnids0.py
deleted file mode 100644
index 3bd64523..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisghcnids0.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_cuahsihisghcnids0 import implnet_job_cuahsihisghcnids0
-
-@schedule(cron_schedule="0 20 8 * *", job=implnet_job_cuahsihisghcnids0, execution_timezone="US/Central")
-def implnet_sch_cuahsihisghcnids0(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisglacialridgeids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisglacialridgeids0.py
deleted file mode 100644
index b3c54291..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisglacialridgeids0.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_cuahsihisglacialridgeids0 import implnet_job_cuahsihisglacialridgeids0
-
-@schedule(cron_schedule="0 8 18 * *", job=implnet_job_cuahsihisglacialridgeids0, execution_timezone="US/Central")
-def implnet_sch_cuahsihisglacialridgeids0(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisgleonauburnids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisgleonauburnids0.py
deleted file mode 100644
index e6b3de9c..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisgleonauburnids0.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_cuahsihisgleonauburnids0 import implnet_job_cuahsihisgleonauburnids0
-
-@schedule(cron_schedule="0 16 19 * *", job=implnet_job_cuahsihisgleonauburnids0, execution_timezone="US/Central")
-def implnet_sch_cuahsihisgleonauburnids0(_context):
-    run_config = {}
-    return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisgleondorsetids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisgleondorsetids0.py
deleted file mode 100644
index 356c75a3..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisgleondorsetids0.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_cuahsihisgleondorsetids0 import implnet_job_cuahsihisgleondorsetids0
-
-@schedule(cron_schedule="0 8 10 * *", job=implnet_job_cuahsihisgleondorsetids0, execution_timezone="US/Central")
-def implnet_sch_cuahsihisgleondorsetids0(_context):
-    run_config = {}
-    return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisgleonlakeannieids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisgleonlakeannieids0.py
deleted file mode 100644
index 045a046b..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisgleonlakeannieids0.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_cuahsihisgleonlakeannieids0 import implnet_job_cuahsihisgleonlakeannieids0
-
-@schedule(cron_schedule="0 8 9 * *", job=implnet_job_cuahsihisgleonlakeannieids0, execution_timezone="US/Central")
-def implnet_sch_cuahsihisgleonlakeannieids0(_context):
-    run_config = {}
-    return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisgleonsunapeeids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisgleonsunapeeids0.py
deleted file mode 100644
index 28b1ac9c..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisgleonsunapeeids0.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_cuahsihisgleonsunapeeids0 import implnet_job_cuahsihisgleonsunapeeids0
-
-@schedule(cron_schedule="0 12 13 * *", job=implnet_job_cuahsihisgleonsunapeeids0, execution_timezone="US/Central")
-def implnet_sch_cuahsihisgleonsunapeeids0(_context):
-    run_config = {}
-    return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisglobalriversobservatoryids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisglobalriversobservatoryids0.py
deleted file mode 100644
index a74d69a5..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisglobalriversobservatoryids0.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_cuahsihisglobalriversobservatoryids0 import implnet_job_cuahsihisglobalriversobservatoryids0
-
-@schedule(cron_schedule="0 16 7 * *", job=implnet_job_cuahsihisglobalriversobservatoryids0, execution_timezone="US/Central")
-def implnet_sch_cuahsihisglobalriversobservatoryids0(_context):
-    run_config = {}
-    return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisgonggaids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisgonggaids0.py
deleted file mode 100644
index d058b0b4..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisgonggaids0.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_cuahsihisgonggaids0 import implnet_job_cuahsihisgonggaids0
-
-@schedule(cron_schedule="0 20 20 * *", job=implnet_job_cuahsihisgonggaids0, execution_timezone="US/Central")
-def implnet_sch_cuahsihisgonggaids0(_context):
-    run_config = {}
-    return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihishassbergeids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihishassbergeids0.py
deleted file mode 100644
index fcac8846..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihishassbergeids0.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_cuahsihishassbergeids0 import implnet_job_cuahsihishassbergeids0
-
-@schedule(cron_schedule="0 12 20 * *", job=implnet_job_cuahsihishassbergeids0, execution_timezone="US/Central")
-def implnet_sch_cuahsihishassbergeids0(_context):
-    run_config = {}
-    return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihishydrodataczdids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihishydrodataczdids0.py
deleted file mode 100644
index 2b580b0f..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihishydrodataczdids0.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_cuahsihishydrodataczdids0 import implnet_job_cuahsihishydrodataczdids0
-
-@schedule(cron_schedule="0 12 17 * *", job=implnet_job_cuahsihishydrodataczdids0, execution_timezone="US/Central")
-def implnet_sch_cuahsihishydrodataczdids0(_context):
-    run_config = {}
-    return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihishydrodataczhrids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihishydrodataczhrids0.py
deleted file mode 100644
index 28bbab3d..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihishydrodataczhrids0.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_cuahsihishydrodataczhrids0 import implnet_job_cuahsihishydrodataczhrids0
-
-@schedule(cron_schedule="0 4 7 * *", job=implnet_job_cuahsihishydrodataczhrids0, execution_timezone="US/Central")
-def implnet_sch_cuahsihishydrodataczhrids0(_context):
-    run_config = {}
-    return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisieeratwilkesuniversityids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisieeratwilkesuniversityids0.py
deleted file mode 100644
index 02cbbb56..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisieeratwilkesuniversityids0.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_cuahsihisieeratwilkesuniversityids0 import implnet_job_cuahsihisieeratwilkesuniversityids0
-
-@schedule(cron_schedule="0 4 15 * *", job=implnet_job_cuahsihisieeratwilkesuniversityids0, execution_timezone="US/Central")
-def implnet_sch_cuahsihisieeratwilkesuniversityids0(_context):
-    run_config = {}
-    return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisirwaids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisirwaids0.py
deleted file mode 100644
index f74db6cf..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisirwaids0.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_cuahsihisirwaids0 import implnet_job_cuahsihisirwaids0
-
-@schedule(cron_schedule="0 4 11 * *", job=implnet_job_cuahsihisirwaids0, execution_timezone="US/Central")
-def implnet_sch_cuahsihisirwaids0(_context):
-    run_config = {}
-    return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisisbenaids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisisbenaids0.py
deleted file mode 100644
index d09f3907..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisisbenaids0.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_cuahsihisisbenaids0 import implnet_job_cuahsihisisbenaids0
-
-@schedule(cron_schedule="0 0 20 * *", job=implnet_job_cuahsihisisbenaids0, execution_timezone="US/Central")
-def implnet_sch_cuahsihisisbenaids0(_context):
-    run_config = {}
-    return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihiskansasweatherdataids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihiskansasweatherdataids0.py
deleted file mode 100644
index b676de2e..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihiskansasweatherdataids0.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_cuahsihiskansasweatherdataids0 import implnet_job_cuahsihiskansasweatherdataids0
-
-@schedule(cron_schedule="0 0 10 * *", job=implnet_job_cuahsihiskansasweatherdataids0, execution_timezone="US/Central")
-def implnet_sch_cuahsihiskansasweatherdataids0(_context):
-    run_config = {}
-    return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihislaselvastreamdischargeids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihislaselvastreamdischargeids0.py
deleted file mode 100644
index cde52a52..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihislaselvastreamdischargeids0.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_cuahsihislaselvastreamdischargeids0 import implnet_job_cuahsihislaselvastreamdischargeids0
-
-@schedule(cron_schedule="0 20 19 * *", job=implnet_job_cuahsihislaselvastreamdischargeids0, execution_timezone="US/Central")
-def implnet_sch_cuahsihislaselvastreamdischargeids0(_context):
-    run_config = {}
-    return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihislczoodm2ids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihislczoodm2ids0.py
deleted file mode 100644
index e30ebcac..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihislczoodm2ids0.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_cuahsihislczoodm2ids0 import implnet_job_cuahsihislczoodm2ids0
-
-@schedule(cron_schedule="0 12 14 * *", job=implnet_job_cuahsihislczoodm2ids0, execution_timezone="US/Central")
-def implnet_sch_cuahsihislczoodm2ids0(_context):
-    run_config = {}
-    return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihislittlebearriverids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihislittlebearriverids0.py
deleted file mode 100644
index dc0578ff..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihislittlebearriverids0.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_cuahsihislittlebearriverids0 import implnet_job_cuahsihislittlebearriverids0
-
-@schedule(cron_schedule="0 16 16 * *", job=implnet_job_cuahsihislittlebearriverids0, execution_timezone="US/Central")
-def implnet_sch_cuahsihislittlebearriverids0(_context):
-    run_config = {}
-    return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisloganrivergamutids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisloganrivergamutids0.py
deleted file mode 100644
index 0ad075c0..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisloganrivergamutids0.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_cuahsihisloganrivergamutids0 import implnet_job_cuahsihisloganrivergamutids0
-
-@schedule(cron_schedule="0 12 16 * *", job=implnet_job_cuahsihisloganrivergamutids0, execution_timezone="US/Central")
-def implnet_sch_cuahsihisloganrivergamutids0(_context):
-    run_config = {}
-    return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisloganriverids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisloganriverids0.py
deleted file mode 100644
index 6af511f8..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisloganriverids0.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_cuahsihisloganriverids0 import implnet_job_cuahsihisloganriverids0
-
-@schedule(cron_schedule="0 16 15 * *", job=implnet_job_cuahsihisloganriverids0, execution_timezone="US/Central")
-def implnet_sch_cuahsihisloganriverids0(_context):
-    run_config = {}
-    return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihislterntlwoodruffids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihislterntlwoodruffids0.py
deleted file mode 100644
index 6b7aa5e6..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihislterntlwoodruffids0.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_cuahsihislterntlwoodruffids0 import implnet_job_cuahsihislterntlwoodruffids0
-
-@schedule(cron_schedule="0 20 16 * *", job=implnet_job_cuahsihislterntlwoodruffids0, execution_timezone="US/Central")
-def implnet_sch_cuahsihislterntlwoodruffids0(_context):
-    run_config = {}
-    return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisluwlids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisluwlids0.py
deleted file mode 100644
index 491cff9c..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisluwlids0.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_cuahsihisluwlids0 import implnet_job_cuahsihisluwlids0
-
-@schedule(cron_schedule="0 12 9 * *", job=implnet_job_cuahsihisluwlids0, execution_timezone="US/Central")
-def implnet_sch_cuahsihisluwlids0(_context):
-    run_config = {}
-    return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihismaaeriids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihismaaeriids0.py
deleted file mode 100644
index 201e8a1c..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihismaaeriids0.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_cuahsihismaaeriids0 import implnet_job_cuahsihismaaeriids0
-
-@schedule(cron_schedule="0 0 14 * *", job=implnet_job_cuahsihismaaeriids0, execution_timezone="US/Central")
-def implnet_sch_cuahsihismaaeriids0(_context):
-    run_config = {}
-    return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihismazarriverprojectids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihismazarriverprojectids0.py
deleted file mode 100644
index a087e02f..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihismazarriverprojectids0.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_cuahsihismazarriverprojectids0 import implnet_job_cuahsihismazarriverprojectids0
-
-@schedule(cron_schedule="0 8 13 * *", job=implnet_job_cuahsihismazarriverprojectids0, execution_timezone="US/Central")
-def implnet_sch_cuahsihismazarriverprojectids0(_context):
-    run_config = {}
-    return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihismmaatacamaids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihismmaatacamaids0.py
deleted file mode 100644
index fc42e187..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihismmaatacamaids0.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_cuahsihismmaatacamaids0 import implnet_job_cuahsihismmaatacamaids0
-
-@schedule(cron_schedule="0 0 9 * *", job=implnet_job_cuahsihismmaatacamaids0, execution_timezone="US/Central")
-def implnet_sch_cuahsihismmaatacamaids0(_context):
-    run_config = {}
-    return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihismobilecrowdhydrologyids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihismobilecrowdhydrologyids0.py
deleted file mode 100644
index 8b415d15..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihismobilecrowdhydrologyids0.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_cuahsihismobilecrowdhydrologyids0 import implnet_job_cuahsihismobilecrowdhydrologyids0
-
-@schedule(cron_schedule="0 4 16 * *", job=implnet_job_cuahsihismobilecrowdhydrologyids0, execution_timezone="US/Central")
-def implnet_sch_cuahsihismobilecrowdhydrologyids0(_context):
-    run_config = {}
-    return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihismopexids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihismopexids0.py
deleted file mode 100644
index 148d4f7a..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihismopexids0.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_cuahsihismopexids0 import implnet_job_cuahsihismopexids0
-
-@schedule(cron_schedule="0 0 21 * *", job=implnet_job_cuahsihismopexids0, execution_timezone="US/Central")
-def implnet_sch_cuahsihismopexids0(_context):
-    run_config = {}
-    return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihismuddyriverids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihismuddyriverids0.py
deleted file mode 100644
index 94a02c32..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihismuddyriverids0.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_cuahsihismuddyriverids0 import implnet_job_cuahsihismuddyriverids0
-
-@schedule(cron_schedule="0 12 8 * *", job=implnet_job_cuahsihismuddyriverids0, execution_timezone="US/Central")
-def implnet_sch_cuahsihismuddyriverids0(_context):
-    run_config = {}
-    return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihismudlakeids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihismudlakeids0.py
deleted file mode 100644
index d791ae51..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihismudlakeids0.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_cuahsihismudlakeids0 import implnet_job_cuahsihismudlakeids0
-
-@schedule(cron_schedule="0 8 15 * *", job=implnet_job_cuahsihismudlakeids0, execution_timezone="US/Central")
-def implnet_sch_cuahsihismudlakeids0(_context):
-    run_config = {}
-    return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihismwdisids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihismwdisids0.py
deleted file mode 100644
index 1ac8f637..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihismwdisids0.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_cuahsihismwdisids0 import implnet_job_cuahsihismwdisids0
-
-@schedule(cron_schedule="0 12 15 * *", job=implnet_job_cuahsihismwdisids0, execution_timezone="US/Central")
-def implnet_sch_cuahsihismwdisids0(_context):
-    run_config = {}
-    return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihismwraids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihismwraids0.py
deleted file mode 100644
index e0b5f4bb..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihismwraids0.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_cuahsihismwraids0 import implnet_job_cuahsihismwraids0
-
-@schedule(cron_schedule="0 20 13 * *", job=implnet_job_cuahsihismwraids0, execution_timezone="US/Central")
-def implnet_sch_cuahsihismwraids0(_context):
-    run_config = {}
-    return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisnashrwaids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisnashrwaids0.py
deleted file mode 100644
index 11f0da34..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisnashrwaids0.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_cuahsihisnashrwaids0 import implnet_job_cuahsihisnashrwaids0
-
-@schedule(cron_schedule="0 0 16 * *", job=implnet_job_cuahsihisnashrwaids0, execution_timezone="US/Central")
-def implnet_sch_cuahsihisnashrwaids0(_context):
-    run_config = {}
-    return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisnceiww2ids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisnceiww2ids0.py
deleted file mode 100644
index 857f66f5..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisnceiww2ids0.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_cuahsihisnceiww2ids0 import implnet_job_cuahsihisnceiww2ids0
-
-@schedule(cron_schedule="0 4 14 * *", job=implnet_job_cuahsihisnceiww2ids0, execution_timezone="US/Central")
-def implnet_sch_cuahsihisnceiww2ids0(_context):
-    run_config = {}
-    return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisneonids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisneonids0.py
deleted file mode 100644
index 3ad1b8e5..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisneonids0.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_cuahsihisneonids0 import implnet_job_cuahsihisneonids0
-
-@schedule(cron_schedule="0 12 7 * *", job=implnet_job_cuahsihisneonids0, execution_timezone="US/Central")
-def implnet_sch_cuahsihisneonids0(_context):
-    run_config = {}
-    return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisnevadosids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisnevadosids0.py
deleted file mode 100644
index d202dbfe..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisnevadosids0.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_cuahsihisnevadosids0 import implnet_job_cuahsihisnevadosids0
-
-@schedule(cron_schedule="0 0 13 * *", job=implnet_job_cuahsihisnevadosids0, execution_timezone="US/Central")
-def implnet_sch_cuahsihisnevadosids0(_context):
-    run_config = {}
-    return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisnevcanids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisnevcanids0.py
deleted file mode 100644
index 92b13937..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisnevcanids0.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_cuahsihisnevcanids0 import implnet_job_cuahsihisnevcanids0
-
-@schedule(cron_schedule="0 4 19 * *", job=implnet_job_cuahsihisnevcanids0, execution_timezone="US/Central")
-def implnet_sch_cuahsihisnevcanids0(_context):
-    run_config = {}
-    return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisnewnids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisnewnids0.py
deleted file mode 100644
index 40a3a618..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisnewnids0.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_cuahsihisnewnids0 import implnet_job_cuahsihisnewnids0
-
-@schedule(cron_schedule="0 0 12 * *", job=implnet_job_cuahsihisnewnids0, execution_timezone="US/Central")
-def implnet_sch_cuahsihisnewnids0(_context):
-    run_config = {}
-    return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisnhgswofids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisnhgswofids0.py
deleted file mode 100644
index a739aa88..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisnhgswofids0.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_cuahsihisnhgswofids0 import implnet_job_cuahsihisnhgswofids0
-
-@schedule(cron_schedule="0 16 20 * *", job=implnet_job_cuahsihisnhgswofids0, execution_timezone="US/Central")
-def implnet_sch_cuahsihisnhgswofids0(_context):
-    run_config = {}
-    return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisnooksackmicroclimatenetworkids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisnooksackmicroclimatenetworkids0.py
deleted file mode 100644
index 1d5d40d2..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisnooksackmicroclimatenetworkids0.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_cuahsihisnooksackmicroclimatenetworkids0 import implnet_job_cuahsihisnooksackmicroclimatenetworkids0
-
-@schedule(cron_schedule="0 8 7 * *", job=implnet_job_cuahsihisnooksackmicroclimatenetworkids0, execution_timezone="US/Central")
-def implnet_sch_cuahsihisnooksackmicroclimatenetworkids0(_context):
-    run_config = {}
-    return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisodmkentstateids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisodmkentstateids0.py
deleted file mode 100644
index 27ca639f..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisodmkentstateids0.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_cuahsihisodmkentstateids0 import implnet_job_cuahsihisodmkentstateids0
-
-@schedule(cron_schedule="0 4 10 * *", job=implnet_job_cuahsihisodmkentstateids0, execution_timezone="US/Central")
-def implnet_sch_cuahsihisodmkentstateids0(_context):
-    run_config = {}
-    return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisorsancohabids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisorsancohabids0.py
deleted file mode 100644
index 92379894..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisorsancohabids0.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_cuahsihisorsancohabids0 import implnet_job_cuahsihisorsancohabids0
-
-@schedule(cron_schedule="0 16 13 * *", job=implnet_job_cuahsihisorsancohabids0, execution_timezone="US/Central")
-def implnet_sch_cuahsihisorsancohabids0(_context):
-    run_config = {}
-    return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihispanolaodmids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihispanolaodmids0.py
deleted file mode 100644
index b104ead5..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihispanolaodmids0.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_cuahsihispanolaodmids0 import implnet_job_cuahsihispanolaodmids0
-
-@schedule(cron_schedule="0 20 11 * *", job=implnet_job_cuahsihispanolaodmids0, execution_timezone="US/Central")
-def implnet_sch_cuahsihispanolaodmids0(_context):
-    run_config = {}
-    return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisparalanaturalezaids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisparalanaturalezaids0.py
deleted file mode 100644
index 9be4e39b..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisparalanaturalezaids0.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_cuahsihisparalanaturalezaids0 import implnet_job_cuahsihisparalanaturalezaids0
-
-@schedule(cron_schedule="0 20 14 * *", job=implnet_job_cuahsihisparalanaturalezaids0, execution_timezone="US/Central")
-def implnet_sch_cuahsihisparalanaturalezaids0(_context):
-    run_config = {}
-    return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisprovorivergamutids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisprovorivergamutids0.py
deleted file mode 100644
index dd9a83b0..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisprovorivergamutids0.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_cuahsihisprovorivergamutids0 import implnet_job_cuahsihisprovorivergamutids0
-
-@schedule(cron_schedule="0 0 11 * *", job=implnet_job_cuahsihisprovorivergamutids0, execution_timezone="US/Central")
-def implnet_sch_cuahsihisprovorivergamutids0(_context):
-    run_config = {}
-    return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisredbuttecreekgamutids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisredbuttecreekgamutids0.py
deleted file mode 100644
index b92891a3..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisredbuttecreekgamutids0.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_cuahsihisredbuttecreekgamutids0 import implnet_job_cuahsihisredbuttecreekgamutids0
-
-@schedule(cron_schedule="0 4 18 * *", job=implnet_job_cuahsihisredbuttecreekgamutids0, execution_timezone="US/Central")
-def implnet_sch_cuahsihisredbuttecreekgamutids0(_context):
-    run_config = {}
-    return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisrmblids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisrmblids0.py
deleted file mode 100644
index 8331350a..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisrmblids0.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_cuahsihisrmblids0 import implnet_job_cuahsihisrmblids0
-
-@schedule(cron_schedule="0 16 11 * *", job=implnet_job_cuahsihisrmblids0, execution_timezone="US/Central")
-def implnet_sch_cuahsihisrmblids0(_context):
-    run_config = {}
-    return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihissagehencreekids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihissagehencreekids0.py
deleted file mode 100644
index f2439e2f..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihissagehencreekids0.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_cuahsihissagehencreekids0 import implnet_job_cuahsihissagehencreekids0
-
-@schedule(cron_schedule="0 0 17 * *", job=implnet_job_cuahsihissagehencreekids0, execution_timezone="US/Central")
-def implnet_sch_cuahsihissagehencreekids0(_context):
-    run_config = {}
-    return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisscanids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisscanids0.py
deleted file mode 100644
index c9e7f9e5..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisscanids0.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_cuahsihisscanids0 import implnet_job_cuahsihisscanids0
-
-@schedule(cron_schedule="0 20 15 * *", job=implnet_job_cuahsihisscanids0, execution_timezone="US/Central")
-def implnet_sch_cuahsihisscanids0(_context):
-    run_config = {}
-    return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisscotlandnwisids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisscotlandnwisids0.py
deleted file mode 100644
index bca44679..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisscotlandnwisids0.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_cuahsihisscotlandnwisids0 import implnet_job_cuahsihisscotlandnwisids0
-
-@schedule(cron_schedule="0 0 8 * *", job=implnet_job_cuahsihisscotlandnwisids0, execution_timezone="US/Central")
-def implnet_sch_cuahsihisscotlandnwisids0(_context):
-    run_config = {}
-    return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisshalenetworkodmids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisshalenetworkodmids0.py
deleted file mode 100644
index 8bf790c8..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisshalenetworkodmids0.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_cuahsihisshalenetworkodmids0 import implnet_job_cuahsihisshalenetworkodmids0
-
-@schedule(cron_schedule="0 20 12 * *", job=implnet_job_cuahsihisshalenetworkodmids0, execution_timezone="US/Central")
-def implnet_sch_cuahsihisshalenetworkodmids0(_context):
-    run_config = {}
-    return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisshalenetworkodmids1.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisshalenetworkodmids1.py
deleted file mode 100644
index eef4e1a4..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisshalenetworkodmids1.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_cuahsihisshalenetworkodmids1 import implnet_job_cuahsihisshalenetworkodmids1
-
-@schedule(cron_schedule="0 4 17 * *", job=implnet_job_cuahsihisshalenetworkodmids1, execution_timezone="US/Central")
-def implnet_sch_cuahsihisshalenetworkodmids1(_context):
-    run_config = {}
-    return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisskcmilltownids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisskcmilltownids0.py
deleted file mode 100644
index 5508cf3c..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisskcmilltownids0.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_cuahsihisskcmilltownids0 import implnet_job_cuahsihisskcmilltownids0
-
-@schedule(cron_schedule="0 12 12 * *", job=implnet_job_cuahsihisskcmilltownids0, execution_timezone="US/Central")
-def implnet_sch_cuahsihisskcmilltownids0(_context):
-    run_config = {}
-    return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihissnotelids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihissnotelids0.py
deleted file mode 100644
index 3fb8e587..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihissnotelids0.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_cuahsihissnotelids0 import implnet_job_cuahsihissnotelids0
-
-@schedule(cron_schedule="0 0 19 * *", job=implnet_job_cuahsihissnotelids0, execution_timezone="US/Central")
-def implnet_sch_cuahsihissnotelids0(_context):
-    run_config = {}
-    return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisswedishmonitoringdataids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisswedishmonitoringdataids0.py
deleted file mode 100644
index 5db961b1..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisswedishmonitoringdataids0.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_cuahsihisswedishmonitoringdataids0 import implnet_job_cuahsihisswedishmonitoringdataids0
-
-@schedule(cron_schedule="0 4 20 * *", job=implnet_job_cuahsihisswedishmonitoringdataids0, execution_timezone="US/Central")
-def implnet_sch_cuahsihisswedishmonitoringdataids0(_context):
-    run_config = {}
-    return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihistarlandwaterqualityids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihistarlandwaterqualityids0.py
deleted file mode 100644
index 7693e39f..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihistarlandwaterqualityids0.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_cuahsihistarlandwaterqualityids0 import implnet_job_cuahsihistarlandwaterqualityids0
-
-@schedule(cron_schedule="0 8 14 * *", job=implnet_job_cuahsihistarlandwaterqualityids0, execution_timezone="US/Central")
-def implnet_sch_cuahsihistarlandwaterqualityids0(_context):
-    run_config = {}
-    return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihistncwaterdataids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihistncwaterdataids0.py
deleted file mode 100644
index ffdd4638..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihistncwaterdataids0.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_cuahsihistncwaterdataids0 import implnet_job_cuahsihistncwaterdataids0
-
-@schedule(cron_schedule="0 20 7 * *", job=implnet_job_cuahsihistncwaterdataids0, execution_timezone="US/Central")
-def implnet_sch_cuahsihistncwaterdataids0(_context):
-    run_config = {}
-    return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihistrwaids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihistrwaids0.py
deleted file mode 100644
index 96e57ff1..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihistrwaids0.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_cuahsihistrwaids0 import implnet_job_cuahsihistrwaids0
-
-@schedule(cron_schedule="0 0 18 * *", job=implnet_job_cuahsihistrwaids0, execution_timezone="US/Central")
-def implnet_sch_cuahsihistrwaids0(_context):
-    run_config = {}
-    return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihistuolumnemdwids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihistuolumnemdwids0.py
deleted file mode 100644
index e32d57ba..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihistuolumnemdwids0.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_cuahsihistuolumnemdwids0 import implnet_job_cuahsihistuolumnemdwids0
-
-@schedule(cron_schedule="0 12 11 * *", job=implnet_job_cuahsihistuolumnemdwids0, execution_timezone="US/Central")
-def implnet_sch_cuahsihistuolumnemdwids0(_context):
-    run_config = {}
-    return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisubwpadids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisubwpadids0.py
deleted file mode 100644
index f34b68cd..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisubwpadids0.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_cuahsihisubwpadids0 import implnet_job_cuahsihisubwpadids0
-
-@schedule(cron_schedule="0 20 17 * *", job=implnet_job_cuahsihisubwpadids0, execution_timezone="US/Central")
-def implnet_sch_cuahsihisubwpadids0(_context):
-    run_config = {}
-    return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisumbcgwids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisumbcgwids0.py
deleted file mode 100644
index d18eed4f..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisumbcgwids0.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_cuahsihisumbcgwids0 import implnet_job_cuahsihisumbcgwids0
-
-@schedule(cron_schedule="0 16 12 * *", job=implnet_job_cuahsihisumbcgwids0, execution_timezone="US/Central")
-def implnet_sch_cuahsihisumbcgwids0(_context):
-    run_config = {}
-    return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisumbcwqids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisumbcwqids0.py
deleted file mode 100644
index 34184964..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisumbcwqids0.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_cuahsihisumbcwqids0 import implnet_job_cuahsihisumbcwqids0
-
-@schedule(cron_schedule="0 4 9 * *", job=implnet_job_cuahsihisumbcwqids0, execution_timezone="US/Central")
-def implnet_sch_cuahsihisumbcwqids0(_context):
-    run_config = {}
-    return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisunhsnowids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisunhsnowids0.py
deleted file mode 100644
index 95d0853f..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisunhsnowids0.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_cuahsihisunhsnowids0 import implnet_job_cuahsihisunhsnowids0
-
-@schedule(cron_schedule="0 8 20 * *", job=implnet_job_cuahsihisunhsnowids0, execution_timezone="US/Central")
-def implnet_sch_cuahsihisunhsnowids0(_context):
-    run_config = {}
-    return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisweiherbachids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisweiherbachids0.py
deleted file mode 100644
index 86abbb12..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisweiherbachids0.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_cuahsihisweiherbachids0 import implnet_job_cuahsihisweiherbachids0
-
-@schedule(cron_schedule="0 4 13 * *", job=implnet_job_cuahsihisweiherbachids0, execution_timezone="US/Central")
-def implnet_sch_cuahsihisweiherbachids0(_context):
-    run_config = {}
-    return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisyosemitehydroclimatenetworkids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisyosemitehydroclimatenetworkids0.py
deleted file mode 100644
index 60eec163..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisyosemitehydroclimatenetworkids0.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_cuahsihisyosemitehydroclimatenetworkids0 import implnet_job_cuahsihisyosemitehydroclimatenetworkids0
-
-@schedule(cron_schedule="0 8 8 * *", job=implnet_job_cuahsihisyosemitehydroclimatenetworkids0, execution_timezone="US/Central")
-def implnet_sch_cuahsihisyosemitehydroclimatenetworkids0(_context):
-    run_config = {}
-    return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_dams0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_dams0.py
deleted file mode 100644
index 4e1dac0f..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_dams0.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_dams0 import implnet_job_dams0
-
-@schedule(cron_schedule="0 12 23 * *", job=implnet_job_dams0, execution_timezone="US/Central")
-def implnet_sch_dams0(_context):
-    run_config = {}
-    return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_dams1.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_dams1.py
deleted file mode 100644
index d9257d08..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_dams1.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_dams1 import implnet_job_dams1
-
-@schedule(cron_schedule="0 16 23 * *", job=implnet_job_dams1, execution_timezone="US/Central")
-def implnet_sch_dams1(_context):
-    run_config = {}
-    return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_damspids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_damspids0.py
deleted file mode 100644
index 864b041c..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_damspids0.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_damspids0 import implnet_job_damspids0
-
-@schedule(cron_schedule="0 0 7 * *", job=implnet_job_damspids0, execution_timezone="US/Central")
-def implnet_sch_damspids0(_context):
-    run_config = {}
-    return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_demo0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_demo0.py
deleted file mode 100644
index 85ea1e34..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_demo0.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_demo0 import implnet_job_demo0
-
-@schedule(cron_schedule="0 12 27 * *", job=implnet_job_demo0, execution_timezone="US/Central")
-def implnet_sch_demo0(_context):
-    run_config = {}
-    return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_gfv11pois0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_gfv11pois0.py
deleted file mode 100644
index ced09e2d..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_gfv11pois0.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_gfv11pois0 import implnet_job_gfv11pois0
-
-@schedule(cron_schedule="0 16 6 * *", job=implnet_job_gfv11pois0, execution_timezone="US/Central")
-def implnet_sch_gfv11pois0(_context):
-    run_config = {}
-    return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_gfv11pois1.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_gfv11pois1.py
deleted file mode 100644
index 0dd95b15..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_gfv11pois1.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_gfv11pois1 import implnet_job_gfv11pois1
-
-@schedule(cron_schedule="0 12 6 * *", job=implnet_job_gfv11pois1, execution_timezone="US/Central")
-def implnet_sch_gfv11pois1(_context):
-    run_config = {}
-    return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_hmw0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_hmw0.py
deleted file mode 100644
index 150d8862..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_hmw0.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_hmw0 import implnet_job_hmw0
-
-@schedule(cron_schedule="0 0 25 * *", job=implnet_job_hmw0, execution_timezone="US/Central")
-def implnet_sch_hmw0(_context):
-    run_config = {}
-    return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_hmw1.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_hmw1.py
deleted file mode 100644
index ab65446c..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_hmw1.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_hmw1 import implnet_job_hmw1
-
-@schedule(cron_schedule="0 4 25 * *", job=implnet_job_hmw1, execution_timezone="US/Central")
-def implnet_sch_hmw1(_context):
-    run_config = {}
-    return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_hu020.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_hu020.py
deleted file mode 100644
index 0bbb1e18..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_hu020.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_hu020 import implnet_job_hu020
-
-@schedule(cron_schedule="0 12 24 * *", job=implnet_job_hu020, execution_timezone="US/Central")
-def implnet_sch_hu020(_context):
-    run_config = {}
-    return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_hu040.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_hu040.py
deleted file mode 100644
index 116970d9..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_hu040.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_hu040 import implnet_job_hu040
-
-@schedule(cron_schedule="0 12 22 * *", job=implnet_job_hu040, execution_timezone="US/Central")
-def implnet_sch_hu040(_context):
-    run_config = {}
-    return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_hu060.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_hu060.py
deleted file mode 100644
index 25998aa2..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_hu060.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_hu060 import implnet_job_hu060
-
-@schedule(cron_schedule="0 20 21 * *", job=implnet_job_hu060, execution_timezone="US/Central")
-def implnet_sch_hu060(_context):
-    run_config = {}
-    return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_hu080.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_hu080.py
deleted file mode 100644
index 4475cf5d..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_hu080.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_hu080 import implnet_job_hu080
-
-@schedule(cron_schedule="0 8 22 * *", job=implnet_job_hu080, execution_timezone="US/Central")
-def implnet_sch_hu080(_context):
-    run_config = {}
-    return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_hu100.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_hu100.py
deleted file mode 100644
index e358d5a2..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_hu100.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_hu100 import implnet_job_hu100
-
-@schedule(cron_schedule="0 4 24 * *", job=implnet_job_hu100, execution_timezone="US/Central")
-def implnet_sch_hu100(_context):
-    run_config = {}
-    return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_huc12pp0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_huc12pp0.py
deleted file mode 100644
index 5bfa5a67..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_huc12pp0.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_huc12pp0 import implnet_job_huc12pp0
-
-@schedule(cron_schedule="0 8 25 * *", job=implnet_job_huc12pp0, execution_timezone="US/Central")
-def implnet_sch_huc12pp0(_context):
-    run_config = {}
-    return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_huc12pp1.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_huc12pp1.py
deleted file mode 100644
index c1e6c81c..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_huc12pp1.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_huc12pp1 import implnet_job_huc12pp1
-
-@schedule(cron_schedule="0 12 25 * *", job=implnet_job_huc12pp1, execution_timezone="US/Central")
-def implnet_sch_huc12pp1(_context):
-    run_config = {}
-    return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_hydrologicunit0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_hydrologicunit0.py
deleted file mode 100644
index babd25af..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_hydrologicunit0.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_hydrologicunit0 import implnet_job_hydrologicunit0
-
-@schedule(cron_schedule="0 20 6 * *", job=implnet_job_hydrologicunit0, execution_timezone="US/Central")
-def implnet_sch_hydrologicunit0(_context):
-    run_config = {}
-    return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_links0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_links0.py
deleted file mode 100644
index adca26f5..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_links0.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_links0 import implnet_job_links0
-
-@schedule(cron_schedule="0 8 27 * *", job=implnet_job_links0, execution_timezone="US/Central")
-def implnet_sch_links0(_context):
-    run_config = {}
-    return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_mainstems0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_mainstems0.py
deleted file mode 100644
index 95b80cee..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_mainstems0.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_mainstems0 import implnet_job_mainstems0
-
-@schedule(cron_schedule="0 16 24 * *", job=implnet_job_mainstems0, execution_timezone="US/Central")
-def implnet_sch_mainstems0(_context):
-    run_config = {}
-    return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nataq0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nataq0.py
deleted file mode 100644
index c9c7c350..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nataq0.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_nataq0 import implnet_job_nataq0
-
-@schedule(cron_schedule="0 0 22 * *", job=implnet_job_nataq0, execution_timezone="US/Central")
-def implnet_sch_nataq0(_context):
-    run_config = {}
-    return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nmwdiose0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nmwdiose0.py
deleted file mode 100644
index 0e14317c..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nmwdiose0.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_nmwdiose0 import implnet_job_nmwdiose0
-
-@schedule(cron_schedule="0 0 26 * *", job=implnet_job_nmwdiose0, execution_timezone="US/Central")
-def implnet_sch_nmwdiose0(_context):
-    run_config = {}
-    return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nmwdiose1.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nmwdiose1.py
deleted file mode 100644
index d5a706c9..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nmwdiose1.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_nmwdiose1 import implnet_job_nmwdiose1
-
-@schedule(cron_schedule="0 8 26 * *", job=implnet_job_nmwdiose1, execution_timezone="US/Central")
-def implnet_sch_nmwdiose1(_context):
-    run_config = {}
-    return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nmwdiose2.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nmwdiose2.py
deleted file mode 100644
index eee08799..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nmwdiose2.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_nmwdiose2 import implnet_job_nmwdiose2
-
-@schedule(cron_schedule="0 20 25 * *", job=implnet_job_nmwdiose2, execution_timezone="US/Central")
-def implnet_sch_nmwdiose2(_context):
-    run_config = {}
-    return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nmwdiose3.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nmwdiose3.py
deleted file mode 100644
index 81a39e99..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nmwdiose3.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_nmwdiose3 import implnet_job_nmwdiose3
-
-@schedule(cron_schedule="0 16 25 * *", job=implnet_job_nmwdiose3, execution_timezone="US/Central")
-def implnet_sch_nmwdiose3(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nmwdiose4.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nmwdiose4.py
deleted file mode 100644
index ae6c0a4a..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nmwdiose4.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_nmwdiose4 import implnet_job_nmwdiose4
-
-@schedule(cron_schedule="0 4 26 * *", job=implnet_job_nmwdiose4, execution_timezone="US/Central")
-def implnet_sch_nmwdiose4(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nmwdist0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nmwdist0.py
deleted file mode 100644
index 8cc30594..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nmwdist0.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_nmwdist0 import implnet_job_nmwdist0
-
-@schedule(cron_schedule="0 12 26 * *", job=implnet_job_nmwdist0, execution_timezone="US/Central")
-def implnet_sch_nmwdist0(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw0.py
deleted file mode 100644
index a707538a..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw0.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_nwisgw0 import implnet_job_nwisgw0
-
-@schedule(cron_schedule="0 16 5 * *", job=implnet_job_nwisgw0, execution_timezone="US/Central")
-def implnet_sch_nwisgw0(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw1.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw1.py
deleted file mode 100644
index 3e6826c5..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw1.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_nwisgw1 import implnet_job_nwisgw1
-
-@schedule(cron_schedule="0 8 4 * *", job=implnet_job_nwisgw1, execution_timezone="US/Central")
-def implnet_sch_nwisgw1(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw10.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw10.py
deleted file mode 100644
index 41ab4c13..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw10.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_nwisgw10 import implnet_job_nwisgw10
-
-@schedule(cron_schedule="0 4 2 * *", job=implnet_job_nwisgw10, execution_timezone="US/Central")
-def implnet_sch_nwisgw10(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw11.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw11.py
deleted file mode 100644
index 3ce3fc53..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw11.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_nwisgw11 import implnet_job_nwisgw11
-
-@schedule(cron_schedule="0 12 5 * *", job=implnet_job_nwisgw11, execution_timezone="US/Central")
-def implnet_sch_nwisgw11(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw12.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw12.py
deleted file mode 100644
index 56f993ba..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw12.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_nwisgw12 import implnet_job_nwisgw12
-
-@schedule(cron_schedule="0 12 1 * *", job=implnet_job_nwisgw12, execution_timezone="US/Central")
-def implnet_sch_nwisgw12(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw13.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw13.py
deleted file mode 100644
index 2b5d4a44..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw13.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_nwisgw13 import implnet_job_nwisgw13
-
-@schedule(cron_schedule="0 16 3 * *", job=implnet_job_nwisgw13, execution_timezone="US/Central")
-def implnet_sch_nwisgw13(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw14.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw14.py
deleted file mode 100644
index 3d6496e9..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw14.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_nwisgw14 import implnet_job_nwisgw14
-
-@schedule(cron_schedule="0 20 1 * *", job=implnet_job_nwisgw14, execution_timezone="US/Central")
-def implnet_sch_nwisgw14(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw15.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw15.py
deleted file mode 100644
index 509bcc69..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw15.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_nwisgw15 import implnet_job_nwisgw15
-
-@schedule(cron_schedule="0 8 2 * *", job=implnet_job_nwisgw15, execution_timezone="US/Central")
-def implnet_sch_nwisgw15(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw16.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw16.py
deleted file mode 100644
index ebc73ed8..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw16.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_nwisgw16 import implnet_job_nwisgw16
-
-@schedule(cron_schedule="0 8 1 * *", job=implnet_job_nwisgw16, execution_timezone="US/Central")
-def implnet_sch_nwisgw16(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw17.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw17.py
deleted file mode 100644
index f495f454..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw17.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_nwisgw17 import implnet_job_nwisgw17
-
-@schedule(cron_schedule="0 0 5 * *", job=implnet_job_nwisgw17, execution_timezone="US/Central")
-def implnet_sch_nwisgw17(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw18.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw18.py
deleted file mode 100644
index b07ae31a..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw18.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_nwisgw18 import implnet_job_nwisgw18
-
-@schedule(cron_schedule="0 4 5 * *", job=implnet_job_nwisgw18, execution_timezone="US/Central")
-def implnet_sch_nwisgw18(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw19.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw19.py
deleted file mode 100644
index edcb74fd..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw19.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_nwisgw19 import implnet_job_nwisgw19
-
-@schedule(cron_schedule="0 0 3 * *", job=implnet_job_nwisgw19, execution_timezone="US/Central")
-def implnet_sch_nwisgw19(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw2.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw2.py
deleted file mode 100644
index 7abba808..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw2.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_nwisgw2 import implnet_job_nwisgw2
-
-@schedule(cron_schedule="0 12 2 * *", job=implnet_job_nwisgw2, execution_timezone="US/Central")
-def implnet_sch_nwisgw2(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw20.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw20.py
deleted file mode 100644
index ca0531d9..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw20.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_nwisgw20 import implnet_job_nwisgw20
-
-@schedule(cron_schedule="0 0 1 * *", job=implnet_job_nwisgw20, execution_timezone="US/Central")
-def implnet_sch_nwisgw20(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw21.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw21.py
deleted file mode 100644
index e1067dcb..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw21.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_nwisgw21 import implnet_job_nwisgw21
-
-@schedule(cron_schedule="0 12 4 * *", job=implnet_job_nwisgw21, execution_timezone="US/Central")
-def implnet_sch_nwisgw21(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw22.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw22.py
deleted file mode 100644
index 6db0b849..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw22.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_nwisgw22 import implnet_job_nwisgw22
-
-@schedule(cron_schedule="0 4 1 * *", job=implnet_job_nwisgw22, execution_timezone="US/Central")
-def implnet_sch_nwisgw22(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw23.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw23.py
deleted file mode 100644
index 0f821b3b..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw23.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_nwisgw23 import implnet_job_nwisgw23
-
-@schedule(cron_schedule="0 0 2 * *", job=implnet_job_nwisgw23, execution_timezone="US/Central")
-def implnet_sch_nwisgw23(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw24.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw24.py
deleted file mode 100644
index 0980e281..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw24.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_nwisgw24 import implnet_job_nwisgw24
-
-@schedule(cron_schedule="0 16 2 * *", job=implnet_job_nwisgw24, execution_timezone="US/Central")
-def implnet_sch_nwisgw24(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw25.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw25.py
deleted file mode 100644
index be96a0d1..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw25.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_nwisgw25 import implnet_job_nwisgw25
-
-@schedule(cron_schedule="0 16 1 * *", job=implnet_job_nwisgw25, execution_timezone="US/Central")
-def implnet_sch_nwisgw25(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw26.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw26.py
deleted file mode 100644
index fd2da44a..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw26.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_nwisgw26 import implnet_job_nwisgw26
-
-@schedule(cron_schedule="0 8 3 * *", job=implnet_job_nwisgw26, execution_timezone="US/Central")
-def implnet_sch_nwisgw26(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw27.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw27.py
deleted file mode 100644
index 344a3223..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw27.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_nwisgw27 import implnet_job_nwisgw27
-
-@schedule(cron_schedule="0 16 4 * *", job=implnet_job_nwisgw27, execution_timezone="US/Central")
-def implnet_sch_nwisgw27(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw28.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw28.py
deleted file mode 100644
index f3eadc99..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw28.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_nwisgw28 import implnet_job_nwisgw28
-
-@schedule(cron_schedule="0 4 3 * *", job=implnet_job_nwisgw28, execution_timezone="US/Central")
-def implnet_sch_nwisgw28(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw3.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw3.py
deleted file mode 100644
index 9d918109..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw3.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_nwisgw3 import implnet_job_nwisgw3
-
-@schedule(cron_schedule="0 0 4 * *", job=implnet_job_nwisgw3, execution_timezone="US/Central")
-def implnet_sch_nwisgw3(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw4.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw4.py
deleted file mode 100644
index b1a0ceee..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw4.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_nwisgw4 import implnet_job_nwisgw4
-
-@schedule(cron_schedule="0 4 4 * *", job=implnet_job_nwisgw4, execution_timezone="US/Central")
-def implnet_sch_nwisgw4(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw5.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw5.py
deleted file mode 100644
index 3900514a..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw5.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_nwisgw5 import implnet_job_nwisgw5
-
-@schedule(cron_schedule="0 12 3 * *", job=implnet_job_nwisgw5, execution_timezone="US/Central")
-def implnet_sch_nwisgw5(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw6.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw6.py
deleted file mode 100644
index addbec36..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw6.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_nwisgw6 import implnet_job_nwisgw6
-
-@schedule(cron_schedule="0 20 3 * *", job=implnet_job_nwisgw6, execution_timezone="US/Central")
-def implnet_sch_nwisgw6(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw7.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw7.py
deleted file mode 100644
index 0f8e4348..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw7.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_nwisgw7 import implnet_job_nwisgw7
-
-@schedule(cron_schedule="0 8 5 * *", job=implnet_job_nwisgw7, execution_timezone="US/Central")
-def implnet_sch_nwisgw7(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw8.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw8.py
deleted file mode 100644
index 4b45d89e..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw8.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_nwisgw8 import implnet_job_nwisgw8
-
-@schedule(cron_schedule="0 20 4 * *", job=implnet_job_nwisgw8, execution_timezone="US/Central")
-def implnet_sch_nwisgw8(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw9.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw9.py
deleted file mode 100644
index f0090f77..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw9.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_nwisgw9 import implnet_job_nwisgw9
-
-@schedule(cron_schedule="0 20 2 * *", job=implnet_job_nwisgw9, execution_timezone="US/Central")
-def implnet_sch_nwisgw9(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwissite0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwissite0.py
deleted file mode 100644
index 6d155641..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwissite0.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_nwissite0 import implnet_job_nwissite0
-
-@schedule(cron_schedule="0 4 6 * *", job=implnet_job_nwissite0, execution_timezone="US/Central")
-def implnet_sch_nwissite0(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwissite1.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwissite1.py
deleted file mode 100644
index 9045a707..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwissite1.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_nwissite1 import implnet_job_nwissite1
-
-@schedule(cron_schedule="0 20 5 * *", job=implnet_job_nwissite1, execution_timezone="US/Central")
-def implnet_sch_nwissite1(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwissite2.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwissite2.py
deleted file mode 100644
index eff433c4..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwissite2.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_nwissite2 import implnet_job_nwissite2
-
-@schedule(cron_schedule="0 8 6 * *", job=implnet_job_nwissite2, execution_timezone="US/Central")
-def implnet_sch_nwissite2(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwissite3.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwissite3.py
deleted file mode 100644
index e8a88282..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwissite3.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_nwissite3 import implnet_job_nwissite3
-
-@schedule(cron_schedule="0 0 6 * *", job=implnet_job_nwissite3, execution_timezone="US/Central")
-def implnet_sch_nwissite3(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_places0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_places0.py
deleted file mode 100644
index 0a0ad632..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_places0.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_places0 import implnet_job_places0
-
-@schedule(cron_schedule="0 20 24 * *", job=implnet_job_places0, execution_timezone="US/Central")
-def implnet_sch_places0(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_princiaq0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_princiaq0.py
deleted file mode 100644
index e33474ce..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_princiaq0.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_princiaq0 import implnet_job_princiaq0
-
-@schedule(cron_schedule="0 16 22 * *", job=implnet_job_princiaq0, execution_timezone="US/Central")
-def implnet_sch_princiaq0(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_pws0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_pws0.py
deleted file mode 100644
index 8defec9d..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_pws0.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_pws0 import implnet_job_pws0
-
-@schedule(cron_schedule="0 16 21 * *", job=implnet_job_pws0, execution_timezone="US/Central")
-def implnet_sch_pws0(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_refgage0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_refgage0.py
deleted file mode 100644
index 5dbcb2ae..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_refgage0.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_refgage0 import implnet_job_refgage0
-
-@schedule(cron_schedule="0 20 22 * *", job=implnet_job_refgage0, execution_timezone="US/Central")
-def implnet_sch_refgage0(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_refgage1.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_refgage1.py
deleted file mode 100644
index 30e8c638..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_refgage1.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_refgage1 import implnet_job_refgage1
-
-@schedule(cron_schedule="0 8 23 * *", job=implnet_job_refgage1, execution_timezone="US/Central")
-def implnet_sch_refgage1(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_refgage2.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_refgage2.py
deleted file mode 100644
index 7038c026..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_refgage2.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_refgage2 import implnet_job_refgage2
-
-@schedule(cron_schedule="0 4 23 * *", job=implnet_job_refgage2, execution_timezone="US/Central")
-def implnet_sch_refgage2(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_refgage3.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_refgage3.py
deleted file mode 100644
index 106951da..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_refgage3.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_refgage3 import implnet_job_refgage3
-
-@schedule(cron_schedule="0 0 23 * *", job=implnet_job_refgage3, execution_timezone="US/Central")
-def implnet_sch_refgage3(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_rise0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_rise0.py
deleted file mode 100644
index 4efb4c9d..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_rise0.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_rise0 import implnet_job_rise0
-
-@schedule(cron_schedule="0 0 27 * *", job=implnet_job_rise0, execution_timezone="US/Central")
-def implnet_sch_rise0(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_sechydrgreg0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_sechydrgreg0.py
deleted file mode 100644
index 64355867..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_sechydrgreg0.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_sechydrgreg0 import implnet_job_sechydrgreg0
-
-@schedule(cron_schedule="0 8 21 * *", job=implnet_job_sechydrgreg0, execution_timezone="US/Central")
-def implnet_sch_sechydrgreg0(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_selfieids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_selfieids0.py
deleted file mode 100644
index b5c76804..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_selfieids0.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_selfieids0 import implnet_job_selfieids0
-
-@schedule(cron_schedule="0 16 26 * *", job=implnet_job_selfieids0, execution_timezone="US/Central")
-def implnet_sch_selfieids0(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_states0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_states0.py
deleted file mode 100644
index ab653cb3..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_states0.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_states0 import implnet_job_states0
-
-@schedule(cron_schedule="0 0 24 * *", job=implnet_job_states0, execution_timezone="US/Central")
-def implnet_sch_states0(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_ua100.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_ua100.py
deleted file mode 100644
index 07d3c80d..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_ua100.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_ua100 import implnet_job_ua100
-
-@schedule(cron_schedule="0 20 23 * *", job=implnet_job_ua100, execution_timezone="US/Central")
-def implnet_sch_ua100(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_wade0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_wade0.py
deleted file mode 100644
index 8cac8291..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_wade0.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_wade0 import implnet_job_wade0
-
-@schedule(cron_schedule="0 0 1 * *", job=implnet_job_wade0, execution_timezone="US/Central")
-def implnet_sch_wade0(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_wade1.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_wade1.py
deleted file mode 100644
index 71a6dd1a..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_wade1.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_wade1 import implnet_job_wade1
-
-@schedule(cron_schedule="0 16 3 * *", job=implnet_job_wade1, execution_timezone="US/Central")
-def implnet_sch_wade1(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_wade10.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_wade10.py
deleted file mode 100644
index 9c25907e..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_wade10.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_wade10 import implnet_job_wade10
-
-@schedule(cron_schedule="0 4 2 * *", job=implnet_job_wade10, execution_timezone="US/Central")
-def implnet_sch_wade10(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_wade11.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_wade11.py
deleted file mode 100644
index 27d16cbf..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_wade11.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_wade11 import implnet_job_wade11
-
-@schedule(cron_schedule="0 0 4 * *", job=implnet_job_wade11, execution_timezone="US/Central")
-def implnet_sch_wade11(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_wade12.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_wade12.py
deleted file mode 100644
index d0857964..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_wade12.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_wade12 import implnet_job_wade12
-
-@schedule(cron_schedule="0 4 3 * *", job=implnet_job_wade12, execution_timezone="US/Central")
-def implnet_sch_wade12(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_wade13.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_wade13.py
deleted file mode 100644
index a148b22d..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_wade13.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_wade13 import implnet_job_wade13
-
-@schedule(cron_schedule="0 16 2 * *", job=implnet_job_wade13, execution_timezone="US/Central")
-def implnet_sch_wade13(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_wade14.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_wade14.py
deleted file mode 100644
index f958c5ca..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_wade14.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_wade14 import implnet_job_wade14
-
-@schedule(cron_schedule="0 8 2 * *", job=implnet_job_wade14, execution_timezone="US/Central")
-def implnet_sch_wade14(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_wade15.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_wade15.py
deleted file mode 100644
index ac28e77b..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_wade15.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_wade15 import implnet_job_wade15
-
-@schedule(cron_schedule="0 20 1 * *", job=implnet_job_wade15, execution_timezone="US/Central")
-def implnet_sch_wade15(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_wade16.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_wade16.py
deleted file mode 100644
index 811d3d65..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_wade16.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_wade16 import implnet_job_wade16
-
-@schedule(cron_schedule="0 12 3 * *", job=implnet_job_wade16, execution_timezone="US/Central")
-def implnet_sch_wade16(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_wade17.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_wade17.py
deleted file mode 100644
index a54ef062..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_wade17.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_wade17 import implnet_job_wade17
-
-@schedule(cron_schedule="0 4 1 * *", job=implnet_job_wade17, execution_timezone="US/Central")
-def implnet_sch_wade17(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_wade18.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_wade18.py
deleted file mode 100644
index 70b2eadc..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_wade18.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_wade18 import implnet_job_wade18
-
-@schedule(cron_schedule="0 12 2 * *", job=implnet_job_wade18, execution_timezone="US/Central")
-def implnet_sch_wade18(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_wade19.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_wade19.py
deleted file mode 100644
index 978d2f6e..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_wade19.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_wade19 import implnet_job_wade19
-
-@schedule(cron_schedule="0 0 3 * *", job=implnet_job_wade19, execution_timezone="US/Central")
-def implnet_sch_wade19(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_wade2.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_wade2.py
deleted file mode 100644
index f41ffb68..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_wade2.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_wade2 import implnet_job_wade2
-
-@schedule(cron_schedule="0 20 27 * *", job=implnet_job_wade2, execution_timezone="US/Central")
-def implnet_sch_wade2(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_wade3.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_wade3.py
deleted file mode 100644
index 78c852f3..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_wade3.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_wade3 import implnet_job_wade3
-
-@schedule(cron_schedule="0 16 1 * *", job=implnet_job_wade3, execution_timezone="US/Central")
-def implnet_sch_wade3(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_wade4.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_wade4.py
deleted file mode 100644
index 469a0ea5..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_wade4.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_wade4 import implnet_job_wade4
-
-@schedule(cron_schedule="0 8 3 * *", job=implnet_job_wade4, execution_timezone="US/Central")
-def implnet_sch_wade4(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_wade5.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_wade5.py
deleted file mode 100644
index d64c68c7..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_wade5.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_wade5 import implnet_job_wade5
-
-@schedule(cron_schedule="0 0 2 * *", job=implnet_job_wade5, execution_timezone="US/Central")
-def implnet_sch_wade5(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_wade6.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_wade6.py
deleted file mode 100644
index d6049d0c..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_wade6.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_wade6 import implnet_job_wade6
-
-@schedule(cron_schedule="0 20 3 * *", job=implnet_job_wade6, execution_timezone="US/Central")
-def implnet_sch_wade6(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_wade7.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_wade7.py
deleted file mode 100644
index 2d6c8681..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_wade7.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_wade7 import implnet_job_wade7
-
-@schedule(cron_schedule="0 12 1 * *", job=implnet_job_wade7, execution_timezone="US/Central")
-def implnet_sch_wade7(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_wade8.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_wade8.py
deleted file mode 100644
index de64874b..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_wade8.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_wade8 import implnet_job_wade8
-
-@schedule(cron_schedule="0 20 2 * *", job=implnet_job_wade8, execution_timezone="US/Central")
-def implnet_sch_wade8(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_wade9.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_wade9.py
deleted file mode 100644
index 2188da6d..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_wade9.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_wade9 import implnet_job_wade9
-
-@schedule(cron_schedule="0 8 1 * *", job=implnet_job_wade9, execution_timezone="US/Central")
-def implnet_sch_wade9(_context):
- run_config = {}
- return run_config
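Note: every implnet-iow schedule file deleted above instantiates the same generated template, varying only the source name and the cron window. A minimal sketch of that template follows (the `example` names and the cron string are placeholders, not taken from any one deleted file):

from dagster import schedule

# Hypothetical job import; each generated file imported its own implnet_job_<source>.
from jobs.implnet_jobs_example import implnet_job_example

# The generator staggers cron windows (hours 0-20 in 4-hour steps, across days of
# the month) so the harvest jobs do not all fire at once.
@schedule(cron_schedule="0 0 1 * *", job=implnet_job_example, execution_timezone="US/Central")
def implnet_sch_example(_context):
    run_config = {}
    return run_config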
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/workspace.yaml b/dagster/implnets/generatedCode/implnet-iow/output/workspace.yaml
deleted file mode 100644
index 54490e1d..00000000
--- a/dagster/implnets/generatedCode/implnet-iow/output/workspace.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-load_from:
- - python_file:
- relative_path: "repositories/repository.py"
- working_directory: .
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_abacus.py b/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_abacus.py
deleted file mode 100644
index 48184735..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_abacus.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_abacus import harvest_abacus
-
-@job
-def implnet_job_abacus():
- harvest_abacus()
\ No newline at end of file
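The implnet-nsdf job files deleted here and in the sections that follow likewise share one generated template: each imports a single harvest op and wraps it in a Dagster job. A minimal sketch, again with placeholder `example` names:

from dagster import job

# Hypothetical op import; each generated file imported its own harvest_<source> op
# from its matching implnet_ops_<source> module.
from ops.implnet_ops_example import harvest_example

@job
def implnet_job_example():
    harvest_example()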
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_acss.py b/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_acss.py
deleted file mode 100644
index 721894c6..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_acss.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_acss import harvest_acss
-
-@job
-def implnet_job_acss():
- harvest_acss()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_adf.py b/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_adf.py
deleted file mode 100644
index e63e412d..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_adf.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_adf import harvest_adf
-
-@job
-def implnet_job_adf():
- harvest_adf()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_arecibo.py b/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_arecibo.py
deleted file mode 100644
index 7b074a93..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_arecibo.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_arecibo import harvest_arecibo
-
-@job
-def implnet_job_arecibo():
- harvest_arecibo()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_asulrdr.py b/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_asulrdr.py
deleted file mode 100644
index 1be0888b..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_asulrdr.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_asulrdr import harvest_asulrdr
-
-@job
-def implnet_job_asulrdr():
- harvest_asulrdr()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_aussda.py b/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_aussda.py
deleted file mode 100644
index e832e112..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_aussda.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_aussda import harvest_aussda
-
-@job
-def implnet_job_aussda():
- harvest_aussda()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_aws.py b/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_aws.py
deleted file mode 100644
index c3ed2f31..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_aws.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_aws import harvest_aws
-
-@job
-def implnet_job_aws():
- harvest_aws()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_borealis.py b/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_borealis.py
deleted file mode 100644
index 70854a55..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_borealis.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_borealis import harvest_borealis
-
-@job
-def implnet_job_borealis():
- harvest_borealis()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_chile.py b/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_chile.py
deleted file mode 100644
index c997fbe7..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_chile.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_chile import harvest_chile
-
-@job
-def implnet_job_chile():
- harvest_chile()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_cifor.py b/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_cifor.py
deleted file mode 100644
index 811c5a10..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_cifor.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_cifor import harvest_cifor
-
-@job
-def implnet_job_cifor():
- harvest_cifor()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_cimmyt.py b/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_cimmyt.py
deleted file mode 100644
index e7cd6da5..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_cimmyt.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_cimmyt import harvest_cimmyt
-
-@job
-def implnet_job_cimmyt():
- harvest_cimmyt()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_cora.py b/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_cora.py
deleted file mode 100644
index 62a6b827..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_cora.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_cora import harvest_cora
-
-@job
-def implnet_job_cora():
- harvest_cora()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_crossda.py b/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_crossda.py
deleted file mode 100644
index 11e92d10..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_crossda.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_crossda import harvest_crossda
-
-@job
-def implnet_job_crossda():
- harvest_crossda()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_cuhk.py b/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_cuhk.py
deleted file mode 100644
index 1657e65a..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_cuhk.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_cuhk import harvest_cuhk
-
-@job
-def implnet_job_cuhk():
- harvest_cuhk()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_cyvers.py b/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_cyvers.py
deleted file mode 100644
index 05c83045..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_cyvers.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_cyvers import harvest_cyvers
-
-@job
-def implnet_job_cyvers():
- harvest_cyvers()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_darus.py b/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_darus.py
deleted file mode 100644
index 45227306..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_darus.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_darus import harvest_darus
-
-@job
-def implnet_job_darus():
- harvest_darus()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_drp.py b/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_drp.py
deleted file mode 100644
index 2fd2b3da..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_drp.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_drp import harvest_drp
-
-@job
-def implnet_job_drp():
- harvest_drp()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_dryad.py b/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_dryad.py
deleted file mode 100644
index aa6f9ba4..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_dryad.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_dryad import harvest_dryad
-
-@job
-def implnet_job_dryad():
- harvest_dryad()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_edatos.py b/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_edatos.py
deleted file mode 100644
index f3a36e6b..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_edatos.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_edatos import harvest_edatos
-
-@job
-def implnet_job_edatos():
- harvest_edatos()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_fiu.py b/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_fiu.py
deleted file mode 100644
index 0d97b5f6..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_fiu.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_fiu import harvest_fiu
-
-@job
-def implnet_job_fiu():
- harvest_fiu()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_gro.py b/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_gro.py
deleted file mode 100644
index 6d887034..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_gro.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_gro import harvest_gro
-
-@job
-def implnet_job_gro():
- harvest_gro()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_harvard.py b/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_harvard.py
deleted file mode 100644
index 064ab7bb..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_harvard.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_harvard import harvest_harvard
-
-@job
-def implnet_job_harvard():
- harvest_harvard()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_hopkins.py b/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_hopkins.py
deleted file mode 100644
index 27897b7a..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_hopkins.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_hopkins import harvest_hopkins
-
-@job
-def implnet_job_hopkins():
- harvest_hopkins()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_hord.py b/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_hord.py
deleted file mode 100644
index 9b137417..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_hord.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_hord import harvest_hord
-
-@job
-def implnet_job_hord():
- harvest_hord()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_ibict.py b/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_ibict.py
deleted file mode 100644
index 0a619c63..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_ibict.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_ibict import harvest_ibict
-
-@job
-def implnet_job_ibict():
- harvest_ibict()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_icarda.py b/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_icarda.py
deleted file mode 100644
index df6ee83d..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_icarda.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_icarda import harvest_icarda
-
-@job
-def implnet_job_icarda():
- harvest_icarda()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_icrisat.py b/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_icrisat.py
deleted file mode 100644
index ff268ab7..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_icrisat.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_icrisat import harvest_icrisat
-
-@job
-def implnet_job_icrisat():
- harvest_icrisat()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_ifdc.py b/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_ifdc.py
deleted file mode 100644
index 44f0526e..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_ifdc.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_ifdc import harvest_ifdc
-
-@job
-def implnet_job_ifdc():
- harvest_ifdc()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_ifsttar.py b/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_ifsttar.py
deleted file mode 100644
index 5a95c4b1..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_ifsttar.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_ifsttar import harvest_ifsttar
-
-@job
-def implnet_job_ifsttar():
- harvest_ifsttar()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_iisg.py b/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_iisg.py
deleted file mode 100644
index 25b3d7af..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_iisg.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_iisg import harvest_iisg
-
-@job
-def implnet_job_iisg():
- harvest_iisg()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_iit.py b/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_iit.py
deleted file mode 100644
index 7bb32b67..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_iit.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_iit import harvest_iit
-
-@job
-def implnet_job_iit():
- harvest_iit()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_ipc.py b/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_ipc.py
deleted file mode 100644
index da470ecf..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_ipc.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_ipc import harvest_ipc
-
-@job
-def implnet_job_ipc():
- harvest_ipc()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_irl.py b/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_irl.py
deleted file mode 100644
index e47f7f2b..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_irl.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_irl import harvest_irl
-
-@job
-def implnet_job_irl():
- harvest_irl()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_irs.py b/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_irs.py
deleted file mode 100644
index f8b1a08d..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_irs.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_irs import harvest_irs
-
-@job
-def implnet_job_irs():
- harvest_irs()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_julich.py b/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_julich.py
deleted file mode 100644
index 5d0a29f5..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_julich.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_julich import harvest_julich
-
-@job
-def implnet_job_julich():
- harvest_julich()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_lida.py b/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_lida.py
deleted file mode 100644
index f0a0e01b..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_lida.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_lida import harvest_lida
-
-@job
-def implnet_job_lida():
- harvest_lida()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_manitoba.py b/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_manitoba.py
deleted file mode 100644
index 9d5e3ad4..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_manitoba.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_manitoba import harvest_manitoba
-
-@job
-def implnet_job_manitoba():
- harvest_manitoba()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_matcommons.py b/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_matcommons.py
deleted file mode 100644
index 004f4a00..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_matcommons.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_matcommons import harvest_matcommons
-
-@job
-def implnet_job_matcommons():
- harvest_matcommons()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_mdf.py b/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_mdf.py
deleted file mode 100644
index dfb76466..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_mdf.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_mdf import harvest_mdf
-
-@job
-def implnet_job_mdf():
- harvest_mdf()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_milano.py b/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_milano.py
deleted file mode 100644
index 70161a3b..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_milano.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_milano import harvest_milano
-
-@job
-def implnet_job_milano():
- harvest_milano()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_neon.py b/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_neon.py
deleted file mode 100644
index 403d9f90..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_neon.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_neon import harvest_neon
-
-@job
-def implnet_job_neon():
- harvest_neon()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_netherland.py b/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_netherland.py
deleted file mode 100644
index 766266df..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_netherland.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_netherland import harvest_netherland
-
-@job
-def implnet_job_netherland():
- harvest_netherland()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_nioz.py b/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_nioz.py
deleted file mode 100644
index a1ddf3f9..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_nioz.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_nioz import harvest_nioz
-
-@job
-def implnet_job_nioz():
- harvest_nioz()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_norway.py b/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_norway.py
deleted file mode 100644
index 25a39907..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_norway.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_norway import harvest_norway
-
-@job
-def implnet_job_norway():
- harvest_norway()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_ntu.py b/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_ntu.py
deleted file mode 100644
index e7770f47..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_ntu.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_ntu import harvest_ntu
-
-@job
-def implnet_job_ntu():
- harvest_ntu()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_ofd.py b/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_ofd.py
deleted file mode 100644
index 0ffab294..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_ofd.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_ofd import harvest_ofd
-
-@job
-def implnet_job_ofd():
- harvest_ofd()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_peking.py b/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_peking.py
deleted file mode 100644
index 753ee08f..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_peking.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_peking import harvest_peking
-
-@job
-def implnet_job_peking():
- harvest_peking()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_pesquisa.py b/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_pesquisa.py
deleted file mode 100644
index 79aae9e5..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_pesquisa.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_pesquisa import harvest_pesquisa
-
-@job
-def implnet_job_pesquisa():
- harvest_pesquisa()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_pucdp.py b/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_pucdp.py
deleted file mode 100644
index 0047c17a..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_pucdp.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_pucdp import harvest_pucdp
-
-@job
-def implnet_job_pucdp():
- harvest_pucdp()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_qdr.py b/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_qdr.py
deleted file mode 100644
index 464834ff..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_qdr.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_qdr import harvest_qdr
-
-@job
-def implnet_job_qdr():
- harvest_qdr()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_rin.py b/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_rin.py
deleted file mode 100644
index 6332771f..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_rin.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_rin import harvest_rin
-
-@job
-def implnet_job_rin():
- harvest_rin()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_rosario.py b/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_rosario.py
deleted file mode 100644
index 6d3e48b3..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_rosario.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_rosario import harvest_rosario
-
-@job
-def implnet_job_rosario():
- harvest_rosario()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_rsu.py b/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_rsu.py
deleted file mode 100644
index 3d186f45..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_rsu.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_rsu import harvest_rsu
-
-@job
-def implnet_job_rsu():
- harvest_rsu()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_sceincespo.py b/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_sceincespo.py
deleted file mode 100644
index 50936c4d..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_sceincespo.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_sceincespo import harvest_sceincespo
-
-@job
-def implnet_job_sceincespo():
- harvest_sceincespo()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_tdi.py b/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_tdi.py
deleted file mode 100644
index da347896..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_tdi.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_tdi import harvest_tdi
-
-@job
-def implnet_job_tdi():
- harvest_tdi()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_tdl.py b/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_tdl.py
deleted file mode 100644
index 152561b9..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_tdl.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_tdl import harvest_tdl
-
-@job
-def implnet_job_tdl():
- harvest_tdl()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_ucdl.py b/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_ucdl.py
deleted file mode 100644
index 644d7e03..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_ucdl.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_ucdl import harvest_ucdl
-
-@job
-def implnet_job_ucdl():
- harvest_ucdl()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_ucla.py b/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_ucla.py
deleted file mode 100644
index 656bce58..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_ucla.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_ucla import harvest_ucla
-
-@job
-def implnet_job_ucla():
- harvest_ucla()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_unb.py b/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_unb.py
deleted file mode 100644
index 004c47b0..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_unb.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_unb import harvest_unb
-
-@job
-def implnet_job_unb():
- harvest_unb()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_unc.py b/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_unc.py
deleted file mode 100644
index 2d536d65..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_unc.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_unc import harvest_unc
-
-@job
-def implnet_job_unc():
- harvest_unc()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_uva.py b/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_uva.py
deleted file mode 100644
index e4dbcc77..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_uva.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_uva import harvest_uva
-
-@job
-def implnet_job_uva():
- harvest_uva()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_uwi.py b/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_uwi.py
deleted file mode 100644
index 2f465b74..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_uwi.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_uwi import harvest_uwi
-
-@job
-def implnet_job_uwi():
- harvest_uwi()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_vtti.py b/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_vtti.py
deleted file mode 100644
index 416546d7..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_vtti.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_vtti import harvest_vtti
-
-@job
-def implnet_job_vtti():
- harvest_vtti()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_wardr.py b/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_wardr.py
deleted file mode 100644
index cd0a591c..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_wardr.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_wardr import harvest_wardr
-
-@job
-def implnet_job_wardr():
- harvest_wardr()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_yalenus.py b/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_yalenus.py
deleted file mode 100644
index 89ae4aea..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_yalenus.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_yalenus import harvest_yalenus
-
-@job
-def implnet_job_yalenus():
- harvest_yalenus()
\ No newline at end of file
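Every job module removed above is an instance of the same seven-line wrapper, varying only the source name. A minimal regeneration sketch (render_job_module is illustrative, not part of the repo):

from string import Template

# Each deleted implnet_jobs_<source>.py matches this template exactly.
JOB_TEMPLATE = Template("""\
from dagster import job

from ops.implnet_ops_$source import harvest_$source

@job
def implnet_job_$source():
    harvest_$source()
""")

def render_job_module(source: str) -> str:
    return JOB_TEMPLATE.substitute(source=source)

# e.g. render_job_module("icarda") reproduces the deleted
# implnet_jobs_icarda.py (modulo the missing trailing newline).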
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_abacus.py b/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_abacus.py
deleted file mode 100644
index 761e2018..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_abacus.py
+++ /dev/null
@@ -1,304 +0,0 @@
-import distutils
-
-from dagster import op, graph, get_dagster_logger
-import subprocess
-import os, json, io
-import urllib
-from urllib import request
-from dagster import job, op, get_dagster_logger
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-
-# Vars and Envs
-
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-MINIO_URL = os.environ.get('GLEANER_MINIO_URL')
-MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT')
-MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL')
-MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET')
-MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY')
-MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET')
-
-
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- get_dagster_logger().info(f"S3 URL : {str(os.environ.get('GLEANER_MINIO_URL'))}")
- get_dagster_logger().info(f"S3 PORT : {str(os.environ.get('GLEANER_MINIO_PORT'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {str(os.environ.get('GLEANER_MINIO_BUCKET'))}")
- get_dagster_logger().info(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
- try:
- data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- client = Minio(
- server,
- # secure=True,
- secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname
- client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'),
- objPrefix,
- io.BytesIO(data),
- len(data))
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-
-
-def gleanerio(mode, source):
- ## ------------ Create
-
- get_dagster_logger().info(f"Create: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH')
- CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"]
- NAME = "gleaner01_" + source
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "nabu"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
- return 1
-
- data = {}
- data["Image"] = IMAGE
- data["Cmd"] = CMD
-
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
- enva = []
- enva.append(str("MINIO_URL={}".format(MINIO_URL)))
- enva.append(str("MINIO_PORT={}".format(MINIO_PORT)))
- enva.append(str("MINIO_SSL={}".format(MINIO_SSL)))
- enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET)))
- enva.append(str("MINIO_KEY={}".format(MINIO_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET)))
-
- data["Env"] = enva
-
- url = URL + 'containers/create'
- params = {
- "name": NAME
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- get_dagster_logger().info(f"URL: {str(url)}")
-
- req = request.Request(url, str.encode(json.dumps(data)))
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- c = r.read()
- d = json.loads(c)
- cid = d['Id']
-
- print(r.status)
- get_dagster_logger().info(f"Create: {str(r.status)}")
-
- # print(cid)
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
- url = URL + 'containers/' + cid + '/archive'
- params = {
- 'path': ARCHIVE_PATH
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
-
- # DATA = read_file_bytestream(ARCHIVE_FILE)
- DATA = s3reader(ARCHIVE_FILE)
-
- req = request.Request(url, data=DATA, method="PUT")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- print(r.status)
- get_dagster_logger().info(f"Archive: {str(r.status)}")
-
- # c = r.read()
- # print(c)
- # d = json.loads(c)
- # print(d)
-
- ## ------------ Start
-
- url = URL + 'containers/' + cid + '/start'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Start: {str(r.status)}")
-
- ## ------------ Wait expect 200
-
- url = URL + 'containers/' + cid + '/wait'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Wait: {str(r.status)}")
-
- ## ------------ Copy logs expect 200
-
- url = URL + 'containers/' + cid + '/logs'
- params = {
- 'stdout': 'true',
- 'stderr': 'false'
- }
- query_string = urllib.parse.urlencode(params)
-
- url = url + "?" + query_string
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- c = r.read()
-
- # write to file
- # f = open(LOGFILE, 'w')
- # f.write(str(c))
- # f.close()
-
- # write to s3
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
-
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"Logs: {str(r.status)}")
-
- ## ------------ Remove expect 204
-
- url = URL + 'containers/' + cid
- req = request.Request(url, method="DELETE")
- req.add_header('X-API-Key', APIKEY)
- # req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Remove: {str(r.status)}")
-
- return 0
-
-@op
-def abacus_gleaner(context):
- returned_value = gleanerio(("gleaner"), "abacus")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner notes are {r} ")
- return r
-
-@op
-def abacus_nabu(context, msg: str):
- returned_value = gleanerio(("nabu"), "abacus")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def abacus_nabuprov(context, msg: str):
- returned_value = gleanerio(("prov"), "abacus")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def abacus_nabuorg(context, msg: str):
- returned_value = gleanerio(("orgs"), "abacus")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def abacus_naburelease(context, msg: str):
- returned_value = gleanerio(("release"), "abacus")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@graph
-def harvest_abacus():
- harvest = abacus_gleaner()
- load1 = abacus_nabu(harvest)
- load2 = abacus_nabuprov(load1)
- load3 = abacus_nabuorg(load2)
- load4 = abacus_naburelease(load3)
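The gleanerio() function in each ops module above drives one container run through the Portainer-proxied Docker API: create the container, PUT a config archive into it, start it, wait for exit, collect stdout, then delete it. A condensed sketch of that lifecycle, assuming the same PORTAINER_URL and PORTAINER_KEY values (portainer_run is a hypothetical helper, not repo code):

import json
import urllib.parse
from urllib import request

def portainer_run(base_url, api_key, image, cmd, name,
                  archive=None, archive_path=None):
    """Create -> (upload archive) -> start -> wait -> logs -> remove.
    Returns the captured stdout bytes."""
    def call(path, method="POST", data=None, ctype=None, params=None):
        url = base_url + path
        if params:
            url += "?" + urllib.parse.urlencode(params)
        req = request.Request(url, data=data, method=method)
        req.add_header("X-API-Key", api_key)
        if ctype:
            req.add_header("content-type", ctype)
        req.add_header("accept", "application/json")
        return request.urlopen(req)

    body = json.dumps({"Image": image, "Cmd": cmd}).encode()
    r = call("containers/create", data=body,
             ctype="application/json", params={"name": name})
    cid = json.loads(r.read())["Id"]
    if archive is not None:  # the gleaner/nabu config tarball read from S3
        call(f"containers/{cid}/archive", method="PUT", data=archive,
             ctype="application/x-compressed", params={"path": archive_path})
    call(f"containers/{cid}/start")
    call(f"containers/{cid}/wait")
    logs = call(f"containers/{cid}/logs", method="GET",
                params={"stdout": "true", "stderr": "false"}).read()
    call(f"containers/{cid}", method="DELETE")
    return logs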
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_acss.py b/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_acss.py
deleted file mode 100644
index 93f29b71..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_acss.py
+++ /dev/null
@@ -1,304 +0,0 @@
-import distutils
-
-from dagster import op, graph, get_dagster_logger
-import subprocess
-import os, json, io
-import urllib
-from urllib import request
-from dagster import job, op, get_dagster_logger
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-
-# Vars and Envs
-
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-MINIO_URL = os.environ.get('GLEANER_MINIO_URL')
-MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT')
-MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL')
-MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET')
-MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY')
-MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET')
-
-
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- get_dagster_logger().info(f"S3 URL : {str(os.environ.get('GLEANER_MINIO_URL'))}")
- get_dagster_logger().info(f"S3 PORT : {str(os.environ.get('GLEANER_MINIO_PORT'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {str(os.environ.get('GLEANER_MINIO_BUCKET'))}")
- get_dagster_logger().info(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
- try:
- data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- client = Minio(
- server,
- # secure=True,
- secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname
- client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'),
- objPrefix,
- io.BytesIO(data),
- len(data))
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-
-
-def gleanerio(mode, source):
- ## ------------ Create
-
- get_dagster_logger().info(f"Create: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH')
- CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"]
- NAME = "gleaner01_" + source
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "nabu"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
- return 1
-
- data = {}
- data["Image"] = IMAGE
- data["Cmd"] = CMD
-
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
- enva = []
- enva.append(str("MINIO_URL={}".format(MINIO_URL)))
- enva.append(str("MINIO_PORT={}".format(MINIO_PORT)))
- enva.append(str("MINIO_SSL={}".format(MINIO_SSL)))
- enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET)))
- enva.append(str("MINIO_KEY={}".format(MINIO_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET)))
-
- data["Env"] = enva
-
- url = URL + 'containers/create'
- params = {
- "name": NAME
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- get_dagster_logger().info(f"URL: {str(url)}")
-
- req = request.Request(url, str.encode(json.dumps(data)))
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- c = r.read()
- d = json.loads(c)
- cid = d['Id']
-
- print(r.status)
- get_dagster_logger().info(f"Create: {str(r.status)}")
-
- # print(cid)
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
- url = URL + 'containers/' + cid + '/archive'
- params = {
- 'path': ARCHIVE_PATH
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
-
- # DATA = read_file_bytestream(ARCHIVE_FILE)
- DATA = s3reader(ARCHIVE_FILE)
-
- req = request.Request(url, data=DATA, method="PUT")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- print(r.status)
- get_dagster_logger().info(f"Archive: {str(r.status)}")
-
- # c = r.read()
- # print(c)
- # d = json.loads(c)
- # print(d)
-
- ## ------------ Start
-
- url = URL + 'containers/' + cid + '/start'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Start: {str(r.status)}")
-
- ## ------------ Wait expect 200
-
- url = URL + 'containers/' + cid + '/wait'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Wait: {str(r.status)}")
-
- ## ------------ Copy logs expect 200
-
- url = URL + 'containers/' + cid + '/logs'
- params = {
- 'stdout': 'true',
- 'stderr': 'false'
- }
- query_string = urllib.parse.urlencode(params)
-
- url = url + "?" + query_string
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- c = r.read()
-
- # write to file
- # f = open(LOGFILE, 'w')
- # f.write(str(c))
- # f.close()
-
- # write to s3
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
-
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"Logs: {str(r.status)}")
-
- ## ------------ Remove expect 204
-
- url = URL + 'containers/' + cid
- req = request.Request(url, method="DELETE")
- req.add_header('X-API-Key', APIKEY)
- # req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Remove: {str(r.status)}")
-
- return 0
-
-@op
-def acss_gleaner(context):
- returned_value = gleanerio(("gleaner"), "acss")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner notes are {r} ")
- return r
-
-@op
-def acss_nabu(context, msg: str):
- returned_value = gleanerio(("nabu"), "acss")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def acss_nabuprov(context, msg: str):
- returned_value = gleanerio(("prov"), "acss")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def acss_nabuorg(context, msg: str):
- returned_value = gleanerio(("orgs"), "acss")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def acss_naburelease(context, msg: str):
- returned_value = gleanerio(("release"), "acss")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@graph
-def harvest_acss():
- harvest = acss_gleaner()
- load1 = acss_nabu(harvest)
- load2 = acss_nabuprov(load1)
- load3 = acss_nabuorg(load2)
- load4 = acss_naburelease(load3)
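Each deleted ops module repeats these same ~300 lines with only the source name changed. The five per-source ops and the harvest graph could equally come from one parameterized factory; a sketch under that assumption (make_harvest_job is hypothetical, and runner stands in for the shared gleanerio function shown above):

from dagster import graph, op, get_dagster_logger

def make_harvest_job(source: str, runner):
    """runner: a callable with the gleanerio(mode, source) signature."""
    @op(name=f"{source}_gleaner")
    def gleaner_op() -> str:
        rc = runner("gleaner", source)
        get_dagster_logger().info(f"Gleaner notes are returned value:{rc}")
        return f"returned value:{rc}"

    def chained(mode: str):
        # One op per follow-on nabu mode, threading the message along.
        @op(name=f"{source}_{mode}")
        def step(msg: str) -> str:
            return msg + f"returned value:{runner(mode, source)}"
        return step

    nabu, prov, orgs, release = (chained(m) for m in
                                 ("nabu", "prov", "orgs", "release"))

    @graph(name=f"harvest_{source}")
    def harvest():
        release(orgs(prov(nabu(gleaner_op()))))

    return harvest.to_job(name=f"implnet_job_{source}")

# e.g. jobs = [make_harvest_job(s, gleanerio) for s in ("abacus", "acss", "adf")]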
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_adf.py b/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_adf.py
deleted file mode 100644
index 36bf33e9..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_adf.py
+++ /dev/null
@@ -1,304 +0,0 @@
-import distutils
-
-from dagster import op, graph, get_dagster_logger
-import subprocess
-import os, json, io
-import urllib
-from urllib import request
-from dagster import job, op, get_dagster_logger
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-
-# Vars and Envs
-
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-MINIO_URL = os.environ.get('GLEANER_MINIO_URL')
-MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT')
-MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL')
-MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET')
-MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY')
-MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET')
-
-
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- get_dagster_logger().info(f"S3 URL : {str(os.environ.get('GLEANER_MINIO_URL'))}")
- get_dagster_logger().info(f"S3 PORT : {str(os.environ.get('GLEANER_MINIO_PORT'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {str(os.environ.get('GLEANER_MINIO_BUCKET'))}")
- get_dagster_logger().info(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
- try:
- data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- client = Minio(
- server,
- # secure=True,
- secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname
- client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'),
- objPrefix,
- io.BytesIO(data),
- len(data))
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-
-
-def gleanerio(mode, source):
- ## ------------ Create
-
- get_dagster_logger().info(f"Create: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH')
- CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"]
- NAME = "gleaner01_" + source
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "nabu"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
- return 1
-
- data = {}
- data["Image"] = IMAGE
- data["Cmd"] = CMD
-
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
- enva = []
- enva.append(str("MINIO_URL={}".format(MINIO_URL)))
- enva.append(str("MINIO_PORT={}".format(MINIO_PORT)))
- enva.append(str("MINIO_SSL={}".format(MINIO_SSL)))
- enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET)))
- enva.append(str("MINIO_KEY={}".format(MINIO_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET)))
-
- data["Env"] = enva
-
- url = URL + 'containers/create'
- params = {
- "name": NAME
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- get_dagster_logger().info(f"URL: {str(url)}")
-
- req = request.Request(url, str.encode(json.dumps(data)))
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- c = r.read()
- d = json.loads(c)
- cid = d['Id']
-
- print(r.status)
- get_dagster_logger().info(f"Create: {str(r.status)}")
-
- # print(cid)
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
- url = URL + 'containers/' + cid + '/archive'
- params = {
- 'path': ARCHIVE_PATH
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
-
- # DATA = read_file_bytestream(ARCHIVE_FILE)
- DATA = s3reader(ARCHIVE_FILE)
-
- req = request.Request(url, data=DATA, method="PUT")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- print(r.status)
- get_dagster_logger().info(f"Archive: {str(r.status)}")
-
- # c = r.read()
- # print(c)
- # d = json.loads(c)
- # print(d)
-
- ## ------------ Start
-
- url = URL + 'containers/' + cid + '/start'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Start: {str(r.status)}")
-
- ## ------------ Wait expect 200
-
- url = URL + 'containers/' + cid + '/wait'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Wait: {str(r.status)}")
-
- ## ------------ Copy logs expect 200
-
- url = URL + 'containers/' + cid + '/logs'
- params = {
- 'stdout': 'true',
- 'stderr': 'false'
- }
- query_string = urllib.parse.urlencode(params)
-
- url = url + "?" + query_string
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- c = r.read()
-
- # write to file
- # f = open(LOGFILE, 'w')
- # f.write(str(c))
- # f.close()
-
- # write to s3
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
-
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"Logs: {str(r.status)}")
-
- ## ------------ Remove expect 204
-
- url = URL + 'containers/' + cid
- req = request.Request(url, method="DELETE")
- req.add_header('X-API-Key', APIKEY)
- # req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Remove: {str(r.status)}")
-
- return 0
-
-@op
-def adf_gleaner(context):
- returned_value = gleanerio(("gleaner"), "adf")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner notes are {r} ")
- return r
-
-@op
-def adf_nabu(context, msg: str):
- returned_value = gleanerio(("nabu"), "adf")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def adf_nabuprov(context, msg: str):
- returned_value = gleanerio(("prov"), "adf")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def adf_nabuorg(context, msg: str):
- returned_value = gleanerio(("orgs"), "adf")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def adf_naburelease(context, msg: str):
- returned_value = gleanerio(("release"), "adf")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@graph
-def harvest_adf():
- harvest = adf_gleaner()
- load1 = adf_nabu(harvest)
- load2 = adf_nabuprov(load1)
- load3 = adf_nabuorg(load2)
- load4 = adf_naburelease(load3)
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_arecibo.py b/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_arecibo.py
deleted file mode 100644
index 049e266a..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_arecibo.py
+++ /dev/null
@@ -1,304 +0,0 @@
-import distutils
-
-from dagster import op, graph, get_dagster_logger
-import subprocess
-import os, json, io
-import urllib
-from urllib import request
-from dagster import job, op, get_dagster_logger
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-
-# Vars and Envs
-
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-MINIO_URL = os.environ.get('GLEANER_MINIO_URL')
-MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT')
-MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL')
-MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET')
-MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY')
-MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET')
-
-
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- get_dagster_logger().info(f"S3 URL : {str(os.environ.get('GLEANER_MINIO_URL'))}")
- get_dagster_logger().info(f"S3 PORT : {str(os.environ.get('GLEANER_MINIO_PORT'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {str(os.environ.get('GLEANER_MINIO_BUCKET'))}")
- get_dagster_logger().info(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
- try:
- data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- client = Minio(
- server,
- # secure=True,
- secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname
- client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'),
- objPrefix,
- io.BytesIO(data),
- len(data))
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-
-
-def gleanerio(mode, source):
- ## ------------ Create
-
- get_dagster_logger().info(f"Create: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH')
- CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"]
- NAME = "gleaner01_" + source
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "nabu"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
- return 1
-
- data = {}
- data["Image"] = IMAGE
- data["Cmd"] = CMD
-
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
- enva = []
- enva.append(str("MINIO_URL={}".format(MINIO_URL)))
- enva.append(str("MINIO_PORT={}".format(MINIO_PORT)))
- enva.append(str("MINIO_SSL={}".format(MINIO_SSL)))
- enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET)))
- enva.append(str("MINIO_KEY={}".format(MINIO_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET)))
-
- data["Env"] = enva
-
- url = URL + 'containers/create'
- params = {
- "name": NAME
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- get_dagster_logger().info(f"URL: {str(url)}")
-
- req = request.Request(url, str.encode(json.dumps(data)))
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- c = r.read()
- d = json.loads(c)
- cid = d['Id']
-
- print(r.status)
- get_dagster_logger().info(f"Create: {str(r.status)}")
-
- # print(cid)
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
- url = URL + 'containers/' + cid + '/archive'
- params = {
- 'path': ARCHIVE_PATH
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
-
- # DATA = read_file_bytestream(ARCHIVE_FILE)
- DATA = s3reader(ARCHIVE_FILE)
-
- req = request.Request(url, data=DATA, method="PUT")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- print(r.status)
- get_dagster_logger().info(f"Archive: {str(r.status)}")
-
- # c = r.read()
- # print(c)
- # d = json.loads(c)
- # print(d)
-
- ## ------------ Start
-
- url = URL + 'containers/' + cid + '/start'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Start: {str(r.status)}")
-
- ## ------------ Wait expect 200
-
- url = URL + 'containers/' + cid + '/wait'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Wait: {str(r.status)}")
-
- ## ------------ Copy logs expect 200
-
- url = URL + 'containers/' + cid + '/logs'
- params = {
- 'stdout': 'true',
- 'stderr': 'false'
- }
- query_string = urllib.parse.urlencode(params)
-
- url = url + "?" + query_string
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- c = r.read()
-
- # write to file
- # f = open(LOGFILE, 'w')
- # f.write(str(c))
- # f.close()
-
- # write to s3
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
-
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"Logs: {str(r.status)}")
-
- ## ------------ Remove expect 204
-
- url = URL + 'containers/' + cid
- req = request.Request(url, method="DELETE")
- req.add_header('X-API-Key', APIKEY)
- # req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Remove: {str(r.status)}")
-
- return 0
-
-@op
-def arecibo_gleaner(context):
- returned_value = gleanerio(("gleaner"), "arecibo")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner notes are {r} ")
- return r
-
-@op
-def arecibo_nabu(context, msg: str):
- returned_value = gleanerio(("nabu"), "arecibo")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def arecibo_nabuprov(context, msg: str):
- returned_value = gleanerio(("prov"), "arecibo")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def arecibo_nabuorg(context, msg: str):
- returned_value = gleanerio(("orgs"), "arecibo")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def arecibo_naburelease(context, msg: str):
- returned_value = gleanerio(("release"), "arecibo")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@graph
-def harvest_arecibo():
- harvest = arecibo_gleaner()
- load1 = arecibo_nabu(harvest)
- load2 = arecibo_nabuprov(load1)
- load3 = arecibo_nabuorg(load2)
- load4 = arecibo_naburelease(load3)
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_asulrdr.py b/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_asulrdr.py
deleted file mode 100644
index 4091e128..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_asulrdr.py
+++ /dev/null
@@ -1,304 +0,0 @@
-import distutils
-
-from dagster import op, graph, get_dagster_logger
-import subprocess
-import os, json, io
-import urllib
-from urllib import request
-from dagster import job, op, get_dagster_logger
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-
-# Vars and Envs
-
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-MINIO_URL = os.environ.get('GLEANER_MINIO_URL')
-MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT')
-MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL')
-MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET')
-MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY')
-MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET')
-
-
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- get_dagster_logger().info(f"S3 URL : {str(os.environ.get('GLEANER_MINIO_URL'))}")
- get_dagster_logger().info(f"S3 PORT : {str(os.environ.get('GLEANER_MINIO_PORT'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {str(os.environ.get('GLEANER_MINIO_BUCKET'))}")
- get_dagster_logger().info(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
- try:
- data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- client = Minio(
- server,
- # secure=True,
- secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname
- client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'),
- objPrefix,
- io.BytesIO(data),
- len(data))
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-
-
-def gleanerio(mode, source):
- ## ------------ Create
-
- get_dagster_logger().info(f"Create: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH')
- CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"]
- NAME = "gleaner01_" + source
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "nabu"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
- return 1
-
- data = {}
- data["Image"] = IMAGE
- data["Cmd"] = CMD
-
- # add env variables to the container spec, e.g. "Env": ["FOO=bar", "BAZ=quux"]
- enva = []
- enva.append(str("MINIO_URL={}".format(MINIO_URL)))
- enva.append(str("MINIO_PORT={}".format(MINIO_PORT)))
- enva.append(str("MINIO_SSL={}".format(MINIO_SSL)))
- enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET)))
- enva.append(str("MINIO_KEY={}".format(MINIO_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET)))
-
- data["Env"] = enva
-
- url = URL + 'containers/create'
- params = {
- "name": NAME
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- get_dagster_logger().info(f"URL: {str(url)}")
-
- req = request.Request(url, str.encode(json.dumps(data)))
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- c = r.read()
- d = json.loads(c)
- cid = d['Id']
-
- print(r.status)
- get_dagster_logger().info(f"Create: {str(r.status)}")
-
- # print(cid)
-
- ## ------------ Upload the config archive (fetched from MinIO via s3reader below) into the container
-
- url = URL + 'containers/' + cid + '/archive'
- params = {
- 'path': ARCHIVE_PATH
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
-
- # DATA = read_file_bytestream(ARCHIVE_FILE)
- DATA = s3reader(ARCHIVE_FILE)
-
- req = request.Request(url, data=DATA, method="PUT")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- print(r.status)
- get_dagster_logger().info(f"Archive: {str(r.status)}")
-
- # c = r.read()
- # print(c)
- # d = json.loads(c)
- # print(d)
-
- ## ------------ Start
-
- url = URL + 'containers/' + cid + '/start'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Start: {str(r.status)}")
-
- ## ------------ Wait expect 200
-
- url = URL + 'containers/' + cid + '/wait'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Wait: {str(r.status)}")
-
- ## ------------ Copy logs expect 200
-
- url = URL + 'containers/' + cid + '/logs'
- params = {
- 'stdout': 'true',
- 'stderr': 'false'
- }
- query_string = urllib.parse.urlencode(params)
-
- url = url + "?" + query_string
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- c = r.read()
-
- # write to file
- # f = open(LOGFILE, 'w')
- # f.write(str(c))
- # f.close()
-
- # write logs to object storage (s3loader targets the configured MinIO bucket)
- s3loader(c, NAME)  # c is already bytes from r.read(); str(c).encode() would upload the repr ("b'...'")
-
-
- get_dagster_logger().info(f"Logs: {str(r.status)}")
-
- ## ------------ Remove expect 204
-
- url = URL + 'containers/' + cid
- req = request.Request(url, method="DELETE")
- req.add_header('X-API-Key', APIKEY)
- # req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Remove: {str(r.status)}")
-
- return 0
-
-@op
-def asulrdr_gleaner(context):
- returned_value = gleanerio(("gleaner"), "asulrdr")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner notes are {r} ")
- return r
-
-@op
-def asulrdr_nabu(context, msg: str):
- returned_value = gleanerio(("nabu"), "asulrdr")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def asulrdr_nabuprov(context, msg: str):
- returned_value = gleanerio(("prov"), "asulrdr")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def asulrdr_nabuorg(context, msg: str):
- returned_value = gleanerio(("orgs"), "asulrdr")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def asulrdr_naburelease(context, msg: str):
- returned_value = gleanerio(("release"), "asulrdr")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@graph
-def harvest_asulrdr():
- harvest = asulrdr_gleaner()
- load1 = asulrdr_nabu(harvest)
- load2 = asulrdr_nabuprov(load1)
- load3 = asulrdr_nabuorg(load2)
- load4 = asulrdr_naburelease(load3)
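Each lifecycle step in the module above (create, archive, start, wait, logs, remove) repeats the same urllib request boilerplate against the Portainer endpoint. A hedged sketch of factoring that into one helper — `portainer_call` is a hypothetical name; the URL and X-API-Key conventions are taken from the code above:

```python
import urllib.parse
from urllib import request

def portainer_call(base, apikey, path, method="GET", params=None, body=None,
                   content_type="application/json"):
    # build URL with optional query string, attach the standard headers once
    url = base + path
    if params:
        url += "?" + urllib.parse.urlencode(params)
    req = request.Request(url, data=body, method=method)
    req.add_header("X-API-Key", apikey)
    req.add_header("content-type", content_type)
    req.add_header("accept", "application/json")
    return request.urlopen(req)

# e.g. the wait step, expecting HTTP 200:
# r = portainer_call(URL, APIKEY, "containers/" + cid + "/wait", method="POST")
```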
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_aussda.py b/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_aussda.py
deleted file mode 100644
index 4b9d1849..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_aussda.py
+++ /dev/null
@@ -1,304 +0,0 @@
-import distutils.util  # strtobool lives in distutils.util; a bare "import distutils" fails at call time
-
-from dagster import op, graph, job, get_dagster_logger
-import os, json, io
-import urllib.parse
-import urllib.request
-from urllib import request
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-
-# Vars and Envs
-
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-MINIO_URL = os.environ.get('GLEANER_MINIO_URL')
-MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT')
-MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL')
-MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET')
-MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY')
-MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET')
-
-
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- get_dagster_logger().info(f"S3 URL : {str(os.environ.get('GLEANER_MINIO_URL'))}")
- get_dagster_logger().info(f"S3 PORT : {str(os.environ.get('GLEANER_MINIO_PORT'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {str(os.environ.get('GLEANER_MINIO_BUCKET'))}")
- get_dagster_logger().info(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
- try:
- data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- client = Minio(
- server,
- # secure=True,
- secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname
- client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'),
- objPrefix,
- io.BytesIO(data),
- len(data))
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-
-
-def gleanerio(mode, source):
- ## ------------ Create
-
- get_dagster_logger().info(f"Create: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH')
- CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"]
- NAME = "gleaner01_" + source
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "nabu"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
- return 1
-
- data = {}
- data["Image"] = IMAGE
- data["Cmd"] = CMD
-
- # add env variables to the container spec, e.g. "Env": ["FOO=bar", "BAZ=quux"]
- enva = []
- enva.append(str("MINIO_URL={}".format(MINIO_URL)))
- enva.append(str("MINIO_PORT={}".format(MINIO_PORT)))
- enva.append(str("MINIO_SSL={}".format(MINIO_SSL)))
- enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET)))
- enva.append(str("MINIO_KEY={}".format(MINIO_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET)))
-
- data["Env"] = enva
-
- url = URL + 'containers/create'
- params = {
- "name": NAME
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- get_dagster_logger().info(f"URL: {str(url)}")
-
- req = request.Request(url, str.encode(json.dumps(data)))
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- c = r.read()
- d = json.loads(c)
- cid = d['Id']
-
- print(r.status)
- get_dagster_logger().info(f"Create: {str(r.status)}")
-
- # print(cid)
-
- ## ------------ Upload the config archive (fetched from MinIO via s3reader below) into the container
-
- url = URL + 'containers/' + cid + '/archive'
- params = {
- 'path': ARCHIVE_PATH
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
-
- # DATA = read_file_bytestream(ARCHIVE_FILE)
- DATA = s3reader(ARCHIVE_FILE)
-
- req = request.Request(url, data=DATA, method="PUT")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- print(r.status)
- get_dagster_logger().info(f"Archive: {str(r.status)}")
-
- # c = r.read()
- # print(c)
- # d = json.loads(c)
- # print(d)
-
- ## ------------ Start
-
- url = URL + 'containers/' + cid + '/start'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Start: {str(r.status)}")
-
- ## ------------ Wait expect 200
-
- url = URL + 'containers/' + cid + '/wait'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Wait: {str(r.status)}")
-
- ## ------------ Copy logs expect 200
-
- url = URL + 'containers/' + cid + '/logs'
- params = {
- 'stdout': 'true',
- 'stderr': 'false'
- }
- query_string = urllib.parse.urlencode(params)
-
- url = url + "?" + query_string
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- c = r.read()
-
- # write to file
- # f = open(LOGFILE, 'w')
- # f.write(str(c))
- # f.close()
-
- # write logs to object storage (s3loader targets the configured MinIO bucket)
- s3loader(c, NAME)  # c is already bytes from r.read(); str(c).encode() would upload the repr ("b'...'")
-
-
- get_dagster_logger().info(f"Logs: {str(r.status)}")
-
- ## ------------ Remove expect 204
-
- url = URL + 'containers/' + cid
- req = request.Request(url, method="DELETE")
- req.add_header('X-API-Key', APIKEY)
- # req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Remove: {str(r.status)}")
-
- return 0
-
-@op
-def aussda_gleaner(context):
- returned_value = gleanerio(("gleaner"), "aussda")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner notes are {r} ")
- return r
-
-@op
-def aussda_nabu(context, msg: str):
- returned_value = gleanerio(("nabu"), "aussda")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def aussda_nabuprov(context, msg: str):
- returned_value = gleanerio(("prov"), "aussda")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def aussda_nabuorg(context, msg: str):
- returned_value = gleanerio(("orgs"), "aussda")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def aussda_naburelease(context, msg: str):
- returned_value = gleanerio(("release"), "aussda")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@graph
-def harvest_aussda():
- harvest = aussda_gleaner()
- load1 = aussda_nabu(harvest)
- load2 = aussda_nabuprov(load1)
- load3 = aussda_nabuorg(load2)
- load4 = aussda_naburelease(load3)
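The wait step above logs only the HTTP status. Under the Docker Engine API that Portainer proxies, the `/wait` response body also reports the container's exit code, which is what actually distinguishes a failed harvest from a successful one. A sketch, assuming `r` is the `/wait` response and the module's logger is in scope:

```python
import json

body = json.loads(r.read())            # e.g. {"StatusCode": 0}
exit_code = body.get("StatusCode")
get_dagster_logger().info(f"Wait: HTTP {r.status}, container exit {exit_code}")
if exit_code:                          # non-zero means the gleaner/nabu run failed
    get_dagster_logger().error(f"{NAME} exited with {exit_code}")
```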
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_aws.py b/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_aws.py
deleted file mode 100644
index 83b1a3f3..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_aws.py
+++ /dev/null
@@ -1,304 +0,0 @@
-import distutils.util  # strtobool lives in distutils.util; a bare "import distutils" fails at call time
-
-from dagster import op, graph, job, get_dagster_logger
-import os, json, io
-import urllib.parse
-import urllib.request
-from urllib import request
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-
-# Vars and Envs
-
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-MINIO_URL = os.environ.get('GLEANER_MINIO_URL')
-MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT')
-MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL')
-MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET')
-MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY')
-MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET')
-
-
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- get_dagster_logger().info(f"S3 URL : {str(os.environ.get('GLEANER_MINIO_URL'))}")
- get_dagster_logger().info(f"S3 PORT : {str(os.environ.get('GLEANER_MINIO_PORT'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {str(os.environ.get('GLEANER_MINIO_BUCKET'))}")
- get_dagster_logger().info(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
- try:
- data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- client = Minio(
- server,
- # secure=True,
- secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname
- client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'),
- objPrefix,
- io.BytesIO(data),
- len(data))
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-
-
-def gleanerio(mode, source):
- ## ------------ Create
-
- get_dagster_logger().info(f"Create: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH')
- CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"]
- NAME = "gleaner01_" + source
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "nabu"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
- return 1
-
- data = {}
- data["Image"] = IMAGE
- data["Cmd"] = CMD
-
- # add env variables to the container spec, e.g. "Env": ["FOO=bar", "BAZ=quux"]
- enva = []
- enva.append(str("MINIO_URL={}".format(MINIO_URL)))
- enva.append(str("MINIO_PORT={}".format(MINIO_PORT)))
- enva.append(str("MINIO_SSL={}".format(MINIO_SSL)))
- enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET)))
- enva.append(str("MINIO_KEY={}".format(MINIO_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET)))
-
- data["Env"] = enva
-
- url = URL + 'containers/create'
- params = {
- "name": NAME
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- get_dagster_logger().info(f"URL: {str(url)}")
-
- req = request.Request(url, str.encode(json.dumps(data)))
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- c = r.read()
- d = json.loads(c)
- cid = d['Id']
-
- print(r.status)
- get_dagster_logger().info(f"Create: {str(r.status)}")
-
- # print(cid)
-
- ## ------------ Upload the config archive (fetched from MinIO via s3reader below) into the container
-
- url = URL + 'containers/' + cid + '/archive'
- params = {
- 'path': ARCHIVE_PATH
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
-
- # DATA = read_file_bytestream(ARCHIVE_FILE)
- DATA = s3reader(ARCHIVE_FILE)
-
- req = request.Request(url, data=DATA, method="PUT")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- print(r.status)
- get_dagster_logger().info(f"Archive: {str(r.status)}")
-
- # c = r.read()
- # print(c)
- # d = json.loads(c)
- # print(d)
-
- ## ------------ Start
-
- url = URL + 'containers/' + cid + '/start'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Start: {str(r.status)}")
-
- ## ------------ Wait expect 200
-
- url = URL + 'containers/' + cid + '/wait'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Wait: {str(r.status)}")
-
- ## ------------ Copy logs expect 200
-
- url = URL + 'containers/' + cid + '/logs'
- params = {
- 'stdout': 'true',
- 'stderr': 'false'
- }
- query_string = urllib.parse.urlencode(params)
-
- url = url + "?" + query_string
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- c = r.read()
-
- # write to file
- # f = open(LOGFILE, 'w')
- # f.write(str(c))
- # f.close()
-
- # write logs to object storage (s3loader targets the configured MinIO bucket)
- s3loader(c, NAME)  # c is already bytes from r.read(); str(c).encode() would upload the repr ("b'...'")
-
-
- get_dagster_logger().info(f"Logs: {str(r.status)}")
-
- ## ------------ Remove expect 204
-
- url = URL + 'containers/' + cid
- req = request.Request(url, method="DELETE")
- req.add_header('X-API-Key', APIKEY)
- # req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Remove: {str(r.status)}")
-
- return 0
-
-@op
-def aws_gleaner(context):
- returned_value = gleanerio(("gleaner"), "aws")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner notes are {r} ")
- return r
-
-@op
-def aws_nabu(context, msg: str):
- returned_value = gleanerio(("nabu"), "aws")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def aws_nabuprov(context, msg: str):
- returned_value = gleanerio(("prov"), "aws")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def aws_nabuorg(context, msg: str):
- returned_value = gleanerio(("orgs"), "aws")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def aws_naburelease(context, msg: str):
- returned_value = gleanerio(("release"), "aws")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@graph
-def harvest_aws():
- harvest = aws_gleaner()
- load1 = aws_nabu(harvest)
- load2 = aws_nabuprov(load1)
- load3 = aws_nabuorg(load2)
- load4 = aws_naburelease(load3)
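`distutils.util.strtobool`, used above to parse GLEANER_MINIO_SSL, is deprecated (PEP 632) and removed in Python 3.12 — one more reason the NOTES.md advice to keep Python versions in sync across the Docker definitions matters. A drop-in sketch with no distutils dependency; `env_bool` is an illustrative name:

```python
import os

def env_bool(name: str, default: str = "false") -> bool:
    # accepts the spellings strtobool did: 1/0, true/false, yes/no, on/off, y/t
    return os.environ.get(name, default).strip().lower() in ("1", "true", "yes", "on", "y", "t")

# usage in the Minio(...) constructors above: secure=env_bool('GLEANER_MINIO_SSL')
```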
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_borealis.py b/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_borealis.py
deleted file mode 100644
index b3e3b566..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_borealis.py
+++ /dev/null
@@ -1,304 +0,0 @@
-import distutils.util  # strtobool lives in distutils.util; a bare "import distutils" fails at call time
-
-from dagster import op, graph, job, get_dagster_logger
-import os, json, io
-import urllib.parse
-import urllib.request
-from urllib import request
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-
-# Vars and Envs
-
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-MINIO_URL = os.environ.get('GLEANER_MINIO_URL')
-MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT')
-MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL')
-MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET')
-MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY')
-MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET')
-
-
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- get_dagster_logger().info(f"S3 URL : {str(os.environ.get('GLEANER_MINIO_URL'))}")
- get_dagster_logger().info(f"S3 PORT : {str(os.environ.get('GLEANER_MINIO_PORT'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {str(os.environ.get('GLEANER_MINIO_BUCKET'))}")
- get_dagster_logger().info(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
- try:
- data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- client = Minio(
- server,
- # secure=True,
- secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname
- client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'),
- objPrefix,
- io.BytesIO(data),
- len(data))
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-
-
-def gleanerio(mode, source):
- ## ------------ Create
-
- get_dagster_logger().info(f"Create: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH')
- CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"]
- NAME = "gleaner01_" + source
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "nabu"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
- return 1
-
- data = {}
- data["Image"] = IMAGE
- data["Cmd"] = CMD
-
- # add env variables to the container spec, e.g. "Env": ["FOO=bar", "BAZ=quux"]
- enva = []
- enva.append(str("MINIO_URL={}".format(MINIO_URL)))
- enva.append(str("MINIO_PORT={}".format(MINIO_PORT)))
- enva.append(str("MINIO_SSL={}".format(MINIO_SSL)))
- enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET)))
- enva.append(str("MINIO_KEY={}".format(MINIO_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET)))
-
- data["Env"] = enva
-
- url = URL + 'containers/create'
- params = {
- "name": NAME
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- get_dagster_logger().info(f"URL: {str(url)}")
-
- req = request.Request(url, str.encode(json.dumps(data)))
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- c = r.read()
- d = json.loads(c)
- cid = d['Id']
-
- print(r.status)
- get_dagster_logger().info(f"Create: {str(r.status)}")
-
- # print(cid)
-
- ## ------------ Upload the config archive (fetched from MinIO via s3reader below) into the container
-
- url = URL + 'containers/' + cid + '/archive'
- params = {
- 'path': ARCHIVE_PATH
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
-
- # DATA = read_file_bytestream(ARCHIVE_FILE)
- DATA = s3reader(ARCHIVE_FILE)
-
- req = request.Request(url, data=DATA, method="PUT")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- print(r.status)
- get_dagster_logger().info(f"Archive: {str(r.status)}")
-
- # c = r.read()
- # print(c)
- # d = json.loads(c)
- # print(d)
-
- ## ------------ Start
-
- url = URL + 'containers/' + cid + '/start'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Start: {str(r.status)}")
-
- ## ------------ Wait expect 200
-
- url = URL + 'containers/' + cid + '/wait'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Wait: {str(r.status)}")
-
- ## ------------ Copy logs expect 200
-
- url = URL + 'containers/' + cid + '/logs'
- params = {
- 'stdout': 'true',
- 'stderr': 'false'
- }
- query_string = urllib.parse.urlencode(params)
-
- url = url + "?" + query_string
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- c = r.read()
-
- # write to file
- # f = open(LOGFILE, 'w')
- # f.write(str(c))
- # f.close()
-
- # write logs to object storage (s3loader targets the configured MinIO bucket)
- s3loader(c, NAME)  # c is already bytes from r.read(); str(c).encode() would upload the repr ("b'...'")
-
-
- get_dagster_logger().info(f"Logs: {str(r.status)}")
-
- ## ------------ Remove expect 204
-
- url = URL + 'containers/' + cid
- req = request.Request(url, method="DELETE")
- req.add_header('X-API-Key', APIKEY)
- # req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Remove: {str(r.status)}")
-
- return 0
-
-@op
-def borealis_gleaner(context):
- returned_value = gleanerio(("gleaner"), "borealis")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner notes are {r} ")
- return r
-
-@op
-def borealis_nabu(context, msg: str):
- returned_value = gleanerio(("nabu"), "borealis")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def borealis_nabuprov(context, msg: str):
- returned_value = gleanerio(("prov"), "borealis")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def borealis_nabuorg(context, msg: str):
- returned_value = gleanerio(("orgs"), "borealis")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def borealis_naburelease(context, msg: str):
- returned_value = gleanerio(("release"), "borealis")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@graph
-def harvest_borealis():
- harvest = borealis_gleaner()
- load1 = borealis_nabu(harvest)
- load2 = borealis_nabuprov(load1)
- load3 = borealis_nabuorg(load2)
- load4 = borealis_naburelease(load3)
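One caveat on the log copy step above: when a container is created without a TTY (the create payload sets no Tty flag, and Docker defaults to false), the logs endpoint returns a multiplexed stream whose frames carry an 8-byte header (1 stream-type byte, 3 zero bytes, 4-byte big-endian payload length), so the raw bytes pushed to MinIO include those headers. A demultiplexing sketch, assuming `c` holds the raw response body:

```python
import struct

def demux_docker_logs(raw: bytes) -> bytes:
    """Strip Docker's 8-byte stream-frame headers, keeping only payload bytes."""
    out, i = b"", 0
    while i + 8 <= len(raw):
        # >BxxxL = stream type (1 byte), 3 padding bytes, big-endian payload size
        _stream, length = struct.unpack(">BxxxL", raw[i:i + 8])
        out += raw[i + 8:i + 8 + length]
        i += 8 + length
    return out

# s3loader(demux_docker_logs(c), NAME)
```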
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_chile.py b/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_chile.py
deleted file mode 100644
index 7a66d0c9..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_chile.py
+++ /dev/null
@@ -1,304 +0,0 @@
-import distutils.util  # strtobool lives in distutils.util; a bare "import distutils" fails at call time
-
-from dagster import op, graph, job, get_dagster_logger
-import os, json, io
-import urllib.parse
-import urllib.request
-from urllib import request
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-
-# Vars and Envs
-
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-MINIO_URL = os.environ.get('GLEANER_MINIO_URL')
-MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT')
-MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL')
-MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET')
-MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY')
-MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET')
-
-
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- get_dagster_logger().info(f"S3 URL : {str(os.environ.get('GLEANER_MINIO_URL'))}")
- get_dagster_logger().info(f"S3 PORT : {str(os.environ.get('GLEANER_MINIO_PORT'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {str(os.environ.get('GLEANER_MINIO_BUCKET'))}")
- get_dagster_logger().info(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
- try:
- data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- client = Minio(
- server,
- # secure=True,
- secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname
- client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'),
- objPrefix,
- io.BytesIO(data),
- len(data))
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-
-
-def gleanerio(mode, source):
- ## ------------ Create
-
- get_dagster_logger().info(f"Create: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH')
- CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"]
- NAME = "gleaner01_" + source
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "nabu"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
- return 1
-
- data = {}
- data["Image"] = IMAGE
- data["Cmd"] = CMD
-
- # add env variables to the container spec, e.g. "Env": ["FOO=bar", "BAZ=quux"]
- enva = []
- enva.append(str("MINIO_URL={}".format(MINIO_URL)))
- enva.append(str("MINIO_PORT={}".format(MINIO_PORT)))
- enva.append(str("MINIO_SSL={}".format(MINIO_SSL)))
- enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET)))
- enva.append(str("MINIO_KEY={}".format(MINIO_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET)))
-
- data["Env"] = enva
-
- url = URL + 'containers/create'
- params = {
- "name": NAME
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- get_dagster_logger().info(f"URL: {str(url)}")
-
- req = request.Request(url, str.encode(json.dumps(data)))
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- c = r.read()
- d = json.loads(c)
- cid = d['Id']
-
- print(r.status)
- get_dagster_logger().info(f"Create: {str(r.status)}")
-
- # print(cid)
-
- ## ------------ Upload the config archive (fetched from MinIO via s3reader below) into the container
-
- url = URL + 'containers/' + cid + '/archive'
- params = {
- 'path': ARCHIVE_PATH
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
-
- # DATA = read_file_bytestream(ARCHIVE_FILE)
- DATA = s3reader(ARCHIVE_FILE)
-
- req = request.Request(url, data=DATA, method="PUT")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- print(r.status)
- get_dagster_logger().info(f"Archive: {str(r.status)}")
-
- # c = r.read()
- # print(c)
- # d = json.loads(c)
- # print(d)
-
- ## ------------ Start
-
- url = URL + 'containers/' + cid + '/start'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Start: {str(r.status)}")
-
- ## ------------ Wait expect 200
-
- url = URL + 'containers/' + cid + '/wait'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Wait: {str(r.status)}")
-
- ## ------------ Copy logs expect 200
-
- url = URL + 'containers/' + cid + '/logs'
- params = {
- 'stdout': 'true',
- 'stderr': 'false'
- }
- query_string = urllib.parse.urlencode(params)
-
- url = url + "?" + query_string
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- c = r.read()
-
- # write to file
- # f = open(LOGFILE, 'w')
- # f.write(str(c))
- # f.close()
-
- # write logs to object storage (s3loader targets the configured MinIO bucket)
- s3loader(c, NAME)  # c is already bytes from r.read(); str(c).encode() would upload the repr ("b'...'")
-
-
- get_dagster_logger().info(f"Logs: {str(r.status)}")
-
- ## ------------ Remove expect 204
-
- url = URL + 'containers/' + cid
- req = request.Request(url, method="DELETE")
- req.add_header('X-API-Key', APIKEY)
- # req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Remove: {str(r.status)}")
-
- return 0
-
-@op
-def chile_gleaner(context):
- returned_value = gleanerio(("gleaner"), "chile")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner notes are {r} ")
- return r
-
-@op
-def chile_nabu(context, msg: str):
- returned_value = gleanerio(("nabu"), "chile")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def chile_nabuprov(context, msg: str):
- returned_value = gleanerio(("prov"), "chile")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def chile_nabuorg(context, msg: str):
- returned_value = gleanerio(("orgs"), "chile")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def chile_naburelease(context, msg: str):
- returned_value = gleanerio(("release"), "chile")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@graph
-def harvest_chile():
- harvest = chile_gleaner()
- load1 = chile_nabu(harvest)
- load2 = chile_nabuprov(load1)
- load3 = chile_nabuorg(load2)
- load4 = chile_naburelease(load3)
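`urllib.request.urlopen` raises `HTTPError` on any non-2xx response, so a failure partway through the sequence above leaves the named container behind and blocks the next run (container names must be unique). One hedged pattern is to make removal unconditional; `remove_container` is an illustrative helper reusing the module's URL and APIKEY:

```python
from urllib import request
from urllib.error import HTTPError

def remove_container(cid: str):
    # best-effort DELETE so a failed run does not strand the container
    req = request.Request(URL + 'containers/' + cid, method="DELETE")
    req.add_header('X-API-Key', APIKEY)
    req.add_header('accept', 'application/json')
    try:
        request.urlopen(req)
    except HTTPError as err:
        get_dagster_logger().info(f"Remove failed: {err.code}")

# usage inside gleanerio(), once the container exists:
# try:
#     ...  # archive, start, wait, logs
# finally:
#     remove_container(cid)
```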
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_cifor.py b/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_cifor.py
deleted file mode 100644
index c78bdd93..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_cifor.py
+++ /dev/null
@@ -1,304 +0,0 @@
-import distutils.util  # strtobool lives in distutils.util; a bare "import distutils" fails at call time
-
-from dagster import op, graph, job, get_dagster_logger
-import os, json, io
-import urllib.parse
-import urllib.request
-from urllib import request
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-
-# Vars and Envs
-
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-MINIO_URL = os.environ.get('GLEANER_MINIO_URL')
-MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT')
-MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL')
-MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET')
-MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY')
-MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET')
-
-
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- get_dagster_logger().info(f"S3 URL : {str(os.environ.get('GLEANER_MINIO_URL'))}")
- get_dagster_logger().info(f"S3 PORT : {str(os.environ.get('GLEANER_MINIO_PORT'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {str(os.environ.get('GLEANER_MINIO_BUCKET'))}")
- get_dagster_logger().info(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
- try:
- data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- client = Minio(
- server,
- # secure=True,
- secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname
- client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'),
- objPrefix,
- io.BytesIO(data),
- len(data))
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-
-
-def gleanerio(mode, source):
- ## ------------ Create
-
- get_dagster_logger().info(f"Create: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH')
- CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"]
- NAME = "gleaner01_" + source
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "nabu"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
- return 1
-
- data = {}
- data["Image"] = IMAGE
- data["Cmd"] = CMD
-
- # add env variables to the container spec, e.g. "Env": ["FOO=bar", "BAZ=quux"]
- enva = []
- enva.append(str("MINIO_URL={}".format(MINIO_URL)))
- enva.append(str("MINIO_PORT={}".format(MINIO_PORT)))
- enva.append(str("MINIO_SSL={}".format(MINIO_SSL)))
- enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET)))
- enva.append(str("MINIO_KEY={}".format(MINIO_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET)))
-
- data["Env"] = enva
-
- url = URL + 'containers/create'
- params = {
- "name": NAME
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- get_dagster_logger().info(f"URL: {str(url)}")
-
- req = request.Request(url, str.encode(json.dumps(data)))
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- c = r.read()
- d = json.loads(c)
- cid = d['Id']
-
- print(r.status)
- get_dagster_logger().info(f"Create: {str(r.status)}")
-
- # print(cid)
-
- ## ------------ Upload the config archive (fetched from MinIO via s3reader below) into the container
-
- url = URL + 'containers/' + cid + '/archive'
- params = {
- 'path': ARCHIVE_PATH
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
-
- # DATA = read_file_bytestream(ARCHIVE_FILE)
- DATA = s3reader(ARCHIVE_FILE)
-
- req = request.Request(url, data=DATA, method="PUT")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- print(r.status)
- get_dagster_logger().info(f"Archive: {str(r.status)}")
-
- # c = r.read()
- # print(c)
- # d = json.loads(c)
- # print(d)
-
- ## ------------ Start
-
- url = URL + 'containers/' + cid + '/start'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Start: {str(r.status)}")
-
- ## ------------ Wait expect 200
-
- url = URL + 'containers/' + cid + '/wait'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Wait: {str(r.status)}")
-
- ## ------------ Copy logs expect 200
-
- url = URL + 'containers/' + cid + '/logs'
- params = {
- 'stdout': 'true',
- 'stderr': 'false'
- }
- query_string = urllib.parse.urlencode(params)
-
- url = url + "?" + query_string
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- c = r.read()
-
- # write to file
- # f = open(LOGFILE, 'w')
- # f.write(str(c))
- # f.close()
-
- # write to s3
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
-
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"Logs: {str(r.status)}")
-
- ## ------------ Remove expect 204
-
- url = URL + 'containers/' + cid
- req = request.Request(url, method="DELETE")
- req.add_header('X-API-Key', APIKEY)
- # req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Remove: {str(r.status)}")
-
- return 0
-
-@op
-def cifor_gleaner(context):
- returned_value = gleanerio(("gleaner"), "cifor")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner notes are {r} ")
- return r
-
-@op
-def cifor_nabu(context, msg: str):
- returned_value = gleanerio(("nabu"), "cifor")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def cifor_nabuprov(context, msg: str):
- returned_value = gleanerio(("prov"), "cifor")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def cifor_nabuorg(context, msg: str):
- returned_value = gleanerio(("orgs"), "cifor")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def cifor_naburelease(context, msg: str):
- returned_value = gleanerio(("release"), "cifor")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@graph
-def harvest_cifor():
- harvest = cifor_gleaner()
- load1 = cifor_nabu(harvest)
- load2 = cifor_nabuprov(load1)
- load3 = cifor_nabuorg(load2)
- load4 = cifor_naburelease(load3)
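
Every generated `implnet_ops_*.py` file deleted in this changeset is byte-identical apart from the hard-coded source name, which is why the whole file collapses to one parameterized factory. A minimal sketch of that idea, assuming the shared `gleanerio(mode, source)` helper is importable from a common module — the factory name and module layout here are hypothetical, not part of this repo:

    from dagster import op, graph, get_dagster_logger

    def build_harvest_graph(source: str):
        # Hypothetical factory collapsing the per-source boilerplate above.
        # Dagster ops/graphs accept an explicit name, so one definition can
        # stand in for all of the generated cifor/cimmyt/... copies.
        @op(name=f"{source}_gleaner")
        def gleaner_op(context):
            rv = gleanerio("gleaner", source)
            get_dagster_logger().info(f"Gleaner returned {rv}")
            return f"returned value:{rv}"

        @op(name=f"{source}_nabu")
        def nabu_op(context, msg: str):
            return msg + f"returned value:{gleanerio('nabu', source)}"

        @op(name=f"{source}_nabuprov")
        def nabuprov_op(context, msg: str):
            return msg + f"returned value:{gleanerio('prov', source)}"

        @op(name=f"{source}_nabuorg")
        def nabuorg_op(context, msg: str):
            return msg + f"returned value:{gleanerio('orgs', source)}"

        @op(name=f"{source}_naburelease")
        def naburelease_op(context, msg: str):
            return msg + f"returned value:{gleanerio('release', source)}"

        @graph(name=f"harvest_{source}")
        def harvest():
            # Same linear chain as the generated harvest_<source> graphs.
            naburelease_op(nabuorg_op(nabuprov_op(nabu_op(gleaner_op()))))

        return harvest
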
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_cimmyt.py b/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_cimmyt.py
deleted file mode 100644
index dba0a8a8..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_cimmyt.py
+++ /dev/null
@@ -1,304 +0,0 @@
-import distutils
-
-from dagster import op, graph, get_dagster_logger
-import subprocess
-import os, json, io
-import urllib
-from urllib import request
-from dagster import job, op, get_dagster_logger
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-
-# Vars and Envs
-
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-MINIO_URL = os.environ.get('GLEANER_MINIO_URL')
-MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT')
-MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL')
-MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET')
-MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY')
-MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET')
-
-
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- get_dagster_logger().info(f"S3 URL : {str(os.environ.get('GLEANER_MINIO_URL'))}")
- get_dagster_logger().info(f"S3 PORT : {str(os.environ.get('GLEANER_MINIO_PORT'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {str(os.environ.get('GLEANER_MINIO_BUCKET'))}")
- get_dagster_logger().info(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
- try:
- data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- client = Minio(
- server,
- # secure=True,
- secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname
- client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'),
- objPrefix,
- io.BytesIO(data),
- len(data))
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-
-
-def gleanerio(mode, source):
- ## ------------ Create
-
- get_dagster_logger().info(f"Create: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH')
- CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"]
- NAME = "gleaner01_" + source
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "nabu"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
- return 1
-
- data = {}
- data["Image"] = IMAGE
- data["Cmd"] = CMD
-
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
- enva = []
- enva.append(str("MINIO_URL={}".format(MINIO_URL)))
- enva.append(str("MINIO_PORT={}".format(MINIO_PORT)))
- enva.append(str("MINIO_SSL={}".format(MINIO_SSL)))
- enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET)))
- enva.append(str("MINIO_KEY={}".format(MINIO_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET)))
-
- data["Env"] = enva
-
- url = URL + 'containers/create'
- params = {
- "name": NAME
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- get_dagster_logger().info(f"URL: {str(url)}")
-
- req = request.Request(url, str.encode(json.dumps(data)))
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- c = r.read()
- d = json.loads(c)
- cid = d['Id']
-
- print(r.status)
- get_dagster_logger().info(f"Create: {str(r.status)}")
-
- # print(cid)
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
- url = URL + 'containers/' + cid + '/archive'
- params = {
- 'path': ARCHIVE_PATH
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
-
- # DATA = read_file_bytestream(ARCHIVE_FILE)
- DATA = s3reader(ARCHIVE_FILE)
-
- req = request.Request(url, data=DATA, method="PUT")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- print(r.status)
- get_dagster_logger().info(f"Archive: {str(r.status)}")
-
- # c = r.read()
- # print(c)
- # d = json.loads(c)
- # print(d)
-
- ## ------------ Start
-
- url = URL + 'containers/' + cid + '/start'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Start: {str(r.status)}")
-
- ## ------------ Wait expect 200
-
- url = URL + 'containers/' + cid + '/wait'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Wait: {str(r.status)}")
-
- ## ------------ Copy logs expect 200
-
- url = URL + 'containers/' + cid + '/logs'
- params = {
- 'stdout': 'true',
- 'stderr': 'false'
- }
- query_string = urllib.parse.urlencode(params)
-
- url = url + "?" + query_string
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- c = r.read()
-
- # write to file
- # f = open(LOGFILE, 'w')
- # f.write(str(c))
- # f.close()
-
- # write to s3
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
-
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"Logs: {str(r.status)}")
-
- ## ------------ Remove expect 204
-
- url = URL + 'containers/' + cid
- req = request.Request(url, method="DELETE")
- req.add_header('X-API-Key', APIKEY)
- # req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Remove: {str(r.status)}")
-
- return 0
-
-@op
-def cimmyt_gleaner(context):
- returned_value = gleanerio(("gleaner"), "cimmyt")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner notes are {r} ")
- return r
-
-@op
-def cimmyt_nabu(context, msg: str):
- returned_value = gleanerio(("nabu"), "cimmyt")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def cimmyt_nabuprov(context, msg: str):
- returned_value = gleanerio(("prov"), "cimmyt")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def cimmyt_nabuorg(context, msg: str):
- returned_value = gleanerio(("orgs"), "cimmyt")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def cimmyt_naburelease(context, msg: str):
- returned_value = gleanerio(("release"), "cimmyt")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@graph
-def harvest_cimmyt():
- harvest = cimmyt_gleaner()
- load1 = cimmyt_nabu(harvest)
- load2 = cimmyt_nabuprov(load1)
- load3 = cimmyt_nabuorg(load2)
- load4 = cimmyt_naburelease(load3)
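
The body of `gleanerio` in each deleted file is five nearly identical urllib round-trips against the Portainer-proxied Docker Engine API: create, upload archive, start, wait, fetch logs, delete. One hedged consolidation of that round-trip, worth noting because `urlopen` raises `HTTPError` on any non-2xx status and the deleted code never catches it — the helper name and return shape below are illustrative only:

    import json
    import urllib.parse
    from urllib import request
    from urllib.error import HTTPError

    def docker_api(base, apikey, path, method="GET", body=None, params=None,
                   content_type="application/json"):
        # One request against the Portainer/Docker endpoint; `base` is
        # assumed to carry the same prefix as URL in the deleted files.
        url = base + path
        if params:
            url += "?" + urllib.parse.urlencode(params)
        data = json.dumps(body).encode() if isinstance(body, dict) else body
        req = request.Request(url, data=data, method=method)
        req.add_header("X-API-Key", apikey)
        req.add_header("content-type", content_type)
        req.add_header("accept", "application/json")
        try:
            with request.urlopen(req) as r:
                return r.status, r.read()
        except HTTPError as err:
            # urlopen raises on 4xx/5xx; surface the status instead of dying.
            return err.code, err.read()

With that in hand, the create/start/wait/logs/delete sequence becomes five short calls instead of five copies of header-setting boilerplate.
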
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_cora.py b/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_cora.py
deleted file mode 100644
index ebb0ae4d..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_cora.py
+++ /dev/null
@@ -1,304 +0,0 @@
-import distutils
-
-from dagster import op, graph, get_dagster_logger
-import subprocess
-import os, json, io
-import urllib
-from urllib import request
-from dagster import job, op, get_dagster_logger
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-
-# Vars and Envs
-
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-MINIO_URL = os.environ.get('GLEANER_MINIO_URL')
-MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT')
-MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL')
-MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET')
-MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY')
-MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET')
-
-
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- get_dagster_logger().info(f"S3 URL : {str(os.environ.get('GLEANER_MINIO_URL'))}")
- get_dagster_logger().info(f"S3 PORT : {str(os.environ.get('GLEANER_MINIO_PORT'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {str(os.environ.get('GLEANER_MINIO_BUCKET'))}")
- get_dagster_logger().info(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
- try:
- data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- client = Minio(
- server,
- # secure=True,
- secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname
- client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'),
- objPrefix,
- io.BytesIO(data),
- len(data))
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-
-
-def gleanerio(mode, source):
- ## ------------ Create
-
- get_dagster_logger().info(f"Create: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH')
- CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"]
- NAME = "gleaner01_" + source
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "nabu"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
- return 1
-
- data = {}
- data["Image"] = IMAGE
- data["Cmd"] = CMD
-
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
- enva = []
- enva.append(str("MINIO_URL={}".format(MINIO_URL)))
- enva.append(str("MINIO_PORT={}".format(MINIO_PORT)))
- enva.append(str("MINIO_SSL={}".format(MINIO_SSL)))
- enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET)))
- enva.append(str("MINIO_KEY={}".format(MINIO_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET)))
-
- data["Env"] = enva
-
- url = URL + 'containers/create'
- params = {
- "name": NAME
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- get_dagster_logger().info(f"URL: {str(url)}")
-
- req = request.Request(url, str.encode(json.dumps(data)))
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- c = r.read()
- d = json.loads(c)
- cid = d['Id']
-
- print(r.status)
- get_dagster_logger().info(f"Create: {str(r.status)}")
-
- # print(cid)
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
- url = URL + 'containers/' + cid + '/archive'
- params = {
- 'path': ARCHIVE_PATH
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
-
- # DATA = read_file_bytestream(ARCHIVE_FILE)
- DATA = s3reader(ARCHIVE_FILE)
-
- req = request.Request(url, data=DATA, method="PUT")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- print(r.status)
- get_dagster_logger().info(f"Archive: {str(r.status)}")
-
- # c = r.read()
- # print(c)
- # d = json.loads(c)
- # print(d)
-
- ## ------------ Start
-
- url = URL + 'containers/' + cid + '/start'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Start: {str(r.status)}")
-
- ## ------------ Wait expect 200
-
- url = URL + 'containers/' + cid + '/wait'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Wait: {str(r.status)}")
-
- ## ------------ Copy logs expect 200
-
- url = URL + 'containers/' + cid + '/logs'
- params = {
- 'stdout': 'true',
- 'stderr': 'false'
- }
- query_string = urllib.parse.urlencode(params)
-
- url = url + "?" + query_string
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- c = r.read()
-
- # write to file
- # f = open(LOGFILE, 'w')
- # f.write(str(c))
- # f.close()
-
- # write to s3
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
-
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"Logs: {str(r.status)}")
-
- ## ------------ Remove expect 204
-
- url = URL + 'containers/' + cid
- req = request.Request(url, method="DELETE")
- req.add_header('X-API-Key', APIKEY)
- # req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Remove: {str(r.status)}")
-
- return 0
-
-@op
-def cora_gleaner(context):
- returned_value = gleanerio(("gleaner"), "cora")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner notes are {r} ")
- return r
-
-@op
-def cora_nabu(context, msg: str):
- returned_value = gleanerio(("nabu"), "cora")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def cora_nabuprov(context, msg: str):
- returned_value = gleanerio(("prov"), "cora")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def cora_nabuorg(context, msg: str):
- returned_value = gleanerio(("orgs"), "cora")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def cora_naburelease(context, msg: str):
- returned_value = gleanerio(("release"), "cora")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@graph
-def harvest_cora():
- harvest = cora_gleaner()
- load1 = cora_nabu(harvest)
- load2 = cora_nabuprov(load1)
- load3 = cora_nabuorg(load2)
- load4 = cora_naburelease(load3)
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_crossda.py b/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_crossda.py
deleted file mode 100644
index 7520c9f1..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_crossda.py
+++ /dev/null
@@ -1,304 +0,0 @@
-import distutils
-
-from dagster import op, graph, get_dagster_logger
-import subprocess
-import os, json, io
-import urllib
-from urllib import request
-from dagster import job, op, get_dagster_logger
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-
-# Vars and Envs
-
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-MINIO_URL = os.environ.get('GLEANER_MINIO_URL')
-MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT')
-MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL')
-MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET')
-MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY')
-MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET')
-
-
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- get_dagster_logger().info(f"S3 URL : {str(os.environ.get('GLEANER_MINIO_URL'))}")
- get_dagster_logger().info(f"S3 PORT : {str(os.environ.get('GLEANER_MINIO_PORT'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {str(os.environ.get('GLEANER_MINIO_BUCKET'))}")
- get_dagster_logger().info(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
- try:
- data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- client = Minio(
- server,
- # secure=True,
- secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname
- client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'),
- objPrefix,
- io.BytesIO(data),
- len(data))
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-
-
-def gleanerio(mode, source):
- ## ------------ Create
-
- get_dagster_logger().info(f"Create: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH')
- CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"]
- NAME = "gleaner01_" + source
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "nabu"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
- return 1
-
- data = {}
- data["Image"] = IMAGE
- data["Cmd"] = CMD
-
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
- enva = []
- enva.append(str("MINIO_URL={}".format(MINIO_URL)))
- enva.append(str("MINIO_PORT={}".format(MINIO_PORT)))
- enva.append(str("MINIO_SSL={}".format(MINIO_SSL)))
- enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET)))
- enva.append(str("MINIO_KEY={}".format(MINIO_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET)))
-
- data["Env"] = enva
-
- url = URL + 'containers/create'
- params = {
- "name": NAME
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- get_dagster_logger().info(f"URL: {str(url)}")
-
- req = request.Request(url, str.encode(json.dumps(data)))
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- c = r.read()
- d = json.loads(c)
- cid = d['Id']
-
- print(r.status)
- get_dagster_logger().info(f"Create: {str(r.status)}")
-
- # print(cid)
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
- url = URL + 'containers/' + cid + '/archive'
- params = {
- 'path': ARCHIVE_PATH
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
-
- # DATA = read_file_bytestream(ARCHIVE_FILE)
- DATA = s3reader(ARCHIVE_FILE)
-
- req = request.Request(url, data=DATA, method="PUT")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- print(r.status)
- get_dagster_logger().info(f"Archive: {str(r.status)}")
-
- # c = r.read()
- # print(c)
- # d = json.loads(c)
- # print(d)
-
- ## ------------ Start
-
- url = URL + 'containers/' + cid + '/start'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Start: {str(r.status)}")
-
- ## ------------ Wait expect 200
-
- url = URL + 'containers/' + cid + '/wait'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Wait: {str(r.status)}")
-
- ## ------------ Copy logs expect 200
-
- url = URL + 'containers/' + cid + '/logs'
- params = {
- 'stdout': 'true',
- 'stderr': 'false'
- }
- query_string = urllib.parse.urlencode(params)
-
- url = url + "?" + query_string
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- c = r.read()
-
- # write to file
- # f = open(LOGFILE, 'w')
- # f.write(str(c))
- # f.close()
-
- # write to s3
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
-
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"Logs: {str(r.status)}")
-
- ## ------------ Remove expect 204
-
- url = URL + 'containers/' + cid
- req = request.Request(url, method="DELETE")
- req.add_header('X-API-Key', APIKEY)
- # req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Remove: {str(r.status)}")
-
- return 0
-
-@op
-def crossda_gleaner(context):
- returned_value = gleanerio(("gleaner"), "crossda")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner notes are {r} ")
- return r
-
-@op
-def crossda_nabu(context, msg: str):
- returned_value = gleanerio(("nabu"), "crossda")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def crossda_nabuprov(context, msg: str):
- returned_value = gleanerio(("prov"), "crossda")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def crossda_nabuorg(context, msg: str):
- returned_value = gleanerio(("orgs"), "crossda")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def crossda_naburelease(context, msg: str):
- returned_value = gleanerio(("release"), "crossda")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@graph
-def harvest_crossda():
- harvest = crossda_gleaner()
- load1 = crossda_nabu(harvest)
- load2 = crossda_nabuprov(load1)
- load3 = crossda_nabuorg(load2)
- load4 = crossda_naburelease(load3)
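
One real defect worth recording before these copies disappear: `bool(distutils.util.strtobool(...))` depends on `distutils`, which is deprecated since Python 3.10 (PEP 632) and removed from the stdlib in 3.12, and the bare `import distutils` at the top of each file does not reliably make `distutils.util` available. A small stdlib-only replacement for parsing `GLEANER_MINIO_SSL` — the function name is an assumption:

    import os

    def env_flag(name: str, default: str = "false") -> bool:
        # Drop-in for bool(distutils.util.strtobool(os.environ.get(name))),
        # accepting the same truthy/falsy spellings strtobool did.
        val = os.environ.get(name, default).strip().lower()
        if val in ("1", "true", "yes", "on", "y", "t"):
            return True
        if val in ("0", "false", "no", "off", "n", "f", ""):
            return False
        raise ValueError(f"invalid boolean value for {name}: {val!r}")
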
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_cuhk.py b/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_cuhk.py
deleted file mode 100644
index 43ff574e..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_cuhk.py
+++ /dev/null
@@ -1,304 +0,0 @@
-import distutils
-
-from dagster import op, graph, get_dagster_logger
-import subprocess
-import os, json, io
-import urllib
-from urllib import request
-from dagster import job, op, get_dagster_logger
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-
-# Vars and Envs
-
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-MINIO_URL = os.environ.get('GLEANER_MINIO_URL')
-MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT')
-MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL')
-MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET')
-MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY')
-MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET')
-
-
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- get_dagster_logger().info(f"S3 URL : {str(os.environ.get('GLEANER_MINIO_URL'))}")
- get_dagster_logger().info(f"S3 PORT : {str(os.environ.get('GLEANER_MINIO_PORT'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {str(os.environ.get('GLEANER_MINIO_BUCKET'))}")
- get_dagster_logger().info(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
- try:
- data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- client = Minio(
- server,
- # secure=True,
- secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname
- client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'),
- objPrefix,
- io.BytesIO(data),
- len(data))
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-
-
-def gleanerio(mode, source):
- ## ------------ Create
-
- get_dagster_logger().info(f"Create: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH')
- CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"]
- NAME = "gleaner01_" + source
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "nabu"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
- return 1
-
- data = {}
- data["Image"] = IMAGE
- data["Cmd"] = CMD
-
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
- enva = []
- enva.append(str("MINIO_URL={}".format(MINIO_URL)))
- enva.append(str("MINIO_PORT={}".format(MINIO_PORT)))
- enva.append(str("MINIO_SSL={}".format(MINIO_SSL)))
- enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET)))
- enva.append(str("MINIO_KEY={}".format(MINIO_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET)))
-
- data["Env"] = enva
-
- url = URL + 'containers/create'
- params = {
- "name": NAME
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- get_dagster_logger().info(f"URL: {str(url)}")
-
- req = request.Request(url, str.encode(json.dumps(data)))
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- c = r.read()
- d = json.loads(c)
- cid = d['Id']
-
- print(r.status)
- get_dagster_logger().info(f"Create: {str(r.status)}")
-
- # print(cid)
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
- url = URL + 'containers/' + cid + '/archive'
- params = {
- 'path': ARCHIVE_PATH
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
-
- # DATA = read_file_bytestream(ARCHIVE_FILE)
- DATA = s3reader(ARCHIVE_FILE)
-
- req = request.Request(url, data=DATA, method="PUT")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- print(r.status)
- get_dagster_logger().info(f"Archive: {str(r.status)}")
-
- # c = r.read()
- # print(c)
- # d = json.loads(c)
- # print(d)
-
- ## ------------ Start
-
- url = URL + 'containers/' + cid + '/start'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Start: {str(r.status)}")
-
- ## ------------ Wait expect 200
-
- url = URL + 'containers/' + cid + '/wait'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Wait: {str(r.status)}")
-
- ## ------------ Copy logs expect 200
-
- url = URL + 'containers/' + cid + '/logs'
- params = {
- 'stdout': 'true',
- 'stderr': 'false'
- }
- query_string = urllib.parse.urlencode(params)
-
- url = url + "?" + query_string
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- c = r.read()
-
- # write to file
- # f = open(LOGFILE, 'w')
- # f.write(str(c))
- # f.close()
-
- # write to s3
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
-
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"Logs: {str(r.status)}")
-
- ## ------------ Remove expect 204
-
- url = URL + 'containers/' + cid
- req = request.Request(url, method="DELETE")
- req.add_header('X-API-Key', APIKEY)
- # req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Remove: {str(r.status)}")
-
- return 0
-
-@op
-def cuhk_gleaner(context):
- returned_value = gleanerio(("gleaner"), "cuhk")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner notes are {r} ")
- return r
-
-@op
-def cuhk_nabu(context, msg: str):
- returned_value = gleanerio(("nabu"), "cuhk")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def cuhk_nabuprov(context, msg: str):
- returned_value = gleanerio(("prov"), "cuhk")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def cuhk_nabuorg(context, msg: str):
- returned_value = gleanerio(("orgs"), "cuhk")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def cuhk_naburelease(context, msg: str):
- returned_value = gleanerio(("release"), "cuhk")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@graph
-def harvest_cuhk():
- harvest = cuhk_gleaner()
- load1 = cuhk_nabu(harvest)
- load2 = cuhk_nabuprov(load1)
- load3 = cuhk_nabuorg(load2)
- load4 = cuhk_naburelease(load3)
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_cyvers.py b/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_cyvers.py
deleted file mode 100644
index 987064a0..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_cyvers.py
+++ /dev/null
@@ -1,304 +0,0 @@
-import distutils
-
-from dagster import op, graph, get_dagster_logger
-import subprocess
-import os, json, io
-import urllib
-from urllib import request
-from dagster import job, op, get_dagster_logger
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-
-# Vars and Envs
-
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-MINIO_URL = os.environ.get('GLEANER_MINIO_URL')
-MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT')
-MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL')
-MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET')
-MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY')
-MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET')
-
-
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- get_dagster_logger().info(f"S3 URL : {str(os.environ.get('GLEANER_MINIO_URL'))}")
- get_dagster_logger().info(f"S3 PORT : {str(os.environ.get('GLEANER_MINIO_PORT'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {str(os.environ.get('GLEANER_MINIO_BUCKET'))}")
- get_dagster_logger().info(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
- try:
- data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- client = Minio(
- server,
- # secure=True,
- secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname
- client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'),
- objPrefix,
- io.BytesIO(data),
- len(data))
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-
-
-def gleanerio(mode, source):
- ## ------------ Create
-
- get_dagster_logger().info(f"Create: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH')
- CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"]
- NAME = "gleaner01_" + source
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "nabu"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
- return 1
-
- data = {}
- data["Image"] = IMAGE
- data["Cmd"] = CMD
-
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
- enva = []
- enva.append(str("MINIO_URL={}".format(MINIO_URL)))
- enva.append(str("MINIO_PORT={}".format(MINIO_PORT)))
- enva.append(str("MINIO_SSL={}".format(MINIO_SSL)))
- enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET)))
- enva.append(str("MINIO_KEY={}".format(MINIO_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET)))
-
- data["Env"] = enva
-
- url = URL + 'containers/create'
- params = {
- "name": NAME
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- get_dagster_logger().info(f"URL: {str(url)}")
-
- req = request.Request(url, str.encode(json.dumps(data)))
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- c = r.read()
- d = json.loads(c)
- cid = d['Id']
-
- print(r.status)
- get_dagster_logger().info(f"Create: {str(r.status)}")
-
- # print(cid)
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
- url = URL + 'containers/' + cid + '/archive'
- params = {
- 'path': ARCHIVE_PATH
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
-
- # DATA = read_file_bytestream(ARCHIVE_FILE)
- DATA = s3reader(ARCHIVE_FILE)
-
- req = request.Request(url, data=DATA, method="PUT")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- print(r.status)
- get_dagster_logger().info(f"Archive: {str(r.status)}")
-
- # c = r.read()
- # print(c)
- # d = json.loads(c)
- # print(d)
-
- ## ------------ Start
-
- url = URL + 'containers/' + cid + '/start'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Start: {str(r.status)}")
-
- ## ------------ Wait expect 200
-
- url = URL + 'containers/' + cid + '/wait'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Wait: {str(r.status)}")
-
- ## ------------ Copy logs expect 200
-
- url = URL + 'containers/' + cid + '/logs'
- params = {
- 'stdout': 'true',
- 'stderr': 'false'
- }
- query_string = urllib.parse.urlencode(params)
-
- url = url + "?" + query_string
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- c = r.read()
-
- # write to file
- # f = open(LOGFILE, 'w')
- # f.write(str(c))
- # f.close()
-
- # write to s3
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
-
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"Logs: {str(r.status)}")
-
- ## ------------ Remove expect 204
-
- url = URL + 'containers/' + cid
- req = request.Request(url, method="DELETE")
- req.add_header('X-API-Key', APIKEY)
- # req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Remove: {str(r.status)}")
-
- return 0
-
-@op
-def cyvers_gleaner(context):
- returned_value = gleanerio(("gleaner"), "cyvers")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner notes are {r} ")
- return r
-
-@op
-def cyvers_nabu(context, msg: str):
- returned_value = gleanerio(("nabu"), "cyvers")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def cyvers_nabuprov(context, msg: str):
- returned_value = gleanerio(("prov"), "cyvers")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def cyvers_nabuorg(context, msg: str):
- returned_value = gleanerio(("orgs"), "cyvers")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def cyvers_naburelease(context, msg: str):
- returned_value = gleanerio(("release"), "cyvers")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@graph
-def harvest_cyvers():
- harvest = cyvers_gleaner()
- load1 = cyvers_nabu(harvest)
- load2 = cyvers_nabuprov(load1)
- load3 = cyvers_nabuorg(load2)
- load4 = cyvers_naburelease(load3)
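
A second quirk repeated in every copy: `s3loader(str(c).encode(), NAME)` round-trips the raw response bytes through `str()`, so the stored log begins with the literal `b'...'` repr and every newline is escaped to `\n`. Decoding first keeps the uploaded log readable — a two-line sketch, assuming UTF-8 output from the container:

    # c is the raw bytes body of the /containers/<id>/logs response.
    log_text = c.decode("utf-8", errors="replace")  # avoid the b'...' repr
    s3loader(log_text.encode("utf-8"), NAME)        # s3loader still gets bytes
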
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_darus.py b/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_darus.py
deleted file mode 100644
index 27a84d3a..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_darus.py
+++ /dev/null
@@ -1,304 +0,0 @@
-import distutils
-
-from dagster import op, graph, get_dagster_logger
-import subprocess
-import os, json, io
-import urllib
-from urllib import request
-from dagster import job, op, get_dagster_logger
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-
-# Vars and Envs
-
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-MINIO_URL = os.environ.get('GLEANER_MINIO_URL')
-MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT')
-MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL')
-MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET')
-MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY')
-MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET')
-
-
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- get_dagster_logger().info(f"S3 URL : {str(os.environ.get('GLEANER_MINIO_URL'))}")
- get_dagster_logger().info(f"S3 PORT : {str(os.environ.get('GLEANER_MINIO_PORT'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {str(os.environ.get('GLEANER_MINIO_BUCKET'))}")
- get_dagster_logger().info(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
- try:
- data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- client = Minio(
- server,
- # secure=True,
- secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname
- client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'),
- objPrefix,
- io.BytesIO(data),
- len(data))
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-
-
-def gleanerio(mode, source):
- ## ------------ Create
-
- get_dagster_logger().info(f"Create: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH')
- CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"]
- NAME = "gleaner01_" + source
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "nabu"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
- return 1
-
- data = {}
- data["Image"] = IMAGE
- data["Cmd"] = CMD
-
- # add env variables here, in the form "Env": ["FOO=bar", "BAZ=quux"]
- enva = []
- enva.append(str("MINIO_URL={}".format(MINIO_URL)))
- enva.append(str("MINIO_PORT={}".format(MINIO_PORT)))
- enva.append(str("MINIO_SSL={}".format(MINIO_SSL)))
- enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET)))
- enva.append(str("MINIO_KEY={}".format(MINIO_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET)))
-
- data["Env"] = enva
-
- url = URL + 'containers/create'
- params = {
- "name": NAME
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- get_dagster_logger().info(f"URL: {str(url)}")
-
- req = request.Request(url, str.encode(json.dumps(data)))
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- c = r.read()
- d = json.loads(c)
- cid = d['Id']
-
- print(r.status)
- get_dagster_logger().info(f"Create: {str(r.status)}")
-
- # print(cid)
-
- ## ------------ Upload archive: send the config (read from S3 below) into the container
-
- url = URL + 'containers/' + cid + '/archive'
- params = {
- 'path': ARCHIVE_PATH
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
-
- # DATA = read_file_bytestream(ARCHIVE_FILE)
- DATA = s3reader(ARCHIVE_FILE)
-
- req = request.Request(url, data=DATA, method="PUT")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- print(r.status)
- get_dagster_logger().info(f"Archive: {str(r.status)}")
-
- # c = r.read()
- # print(c)
- # d = json.loads(c)
- # print(d)
-
- ## ------------ Start
-
- url = URL + 'containers/' + cid + '/start'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Start: {str(r.status)}")
-
- ## ------------ Wait expect 200
-
- url = URL + 'containers/' + cid + '/wait'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Wait: {str(r.status)}")
-
- ## ------------ Copy logs expect 200
-
- url = URL + 'containers/' + cid + '/logs'
- params = {
- 'stdout': 'true',
- 'stderr': 'false'
- }
- query_string = urllib.parse.urlencode(params)
-
- url = url + "?" + query_string
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- c = r.read()
-
- # write to file
- # f = open(LOGFILE, 'w')
- # f.write(str(c))
- # f.close()
-
- # write to s3
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes-like object
-
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"Logs: {str(r.status)}")
-
- ## ------------ Remove expect 204
-
- url = URL + 'containers/' + cid
- req = request.Request(url, method="DELETE")
- req.add_header('X-API-Key', APIKEY)
- # req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Remove: {str(r.status)}")
-
- return 0
-
-@op
-def darus_gleaner(context):
- returned_value = gleanerio(("gleaner"), "darus")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner notes are {r} ")
- return r
-
-@op
-def darus_nabu(context, msg: str):
- returned_value = gleanerio(("nabu"), "darus")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def darus_nabuprov(context, msg: str):
- returned_value = gleanerio(("prov"), "darus")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def darus_nabuorg(context, msg: str):
- returned_value = gleanerio(("orgs"), "darus")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def darus_naburelease(context, msg: str):
- returned_value = gleanerio(("release"), "darus")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@graph
-def harvest_darus():
- harvest = darus_gleaner()
- load1 = darus_nabu(harvest)
- load2 = darus_nabuprov(load1)
- load3 = darus_nabuorg(load2)
- load4 = darus_naburelease(load3)
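
Each of the generated modules deleted in this changeset wires the same five-step chain for one hard-coded source: gleaner summon, then nabu prune, prov, orgs, and release, each op threading its predecessor's message string forward. The per-source duplication could instead come from a small factory evaluated at import time; the sketch below is illustrative (the make_harvest_graph name and op-naming scheme are assumptions, and gleanerio is the helper defined in these modules), not code from this branch.

    from dagster import op, graph, get_dagster_logger

    def make_harvest_graph(source: str):
        # Build the five per-source ops without generating a file per source.
        @op(name=f"{source}_gleaner")
        def gleaner_op(context):
            rv = gleanerio("gleaner", source)  # helper defined in the modules above
            get_dagster_logger().info(f"Gleaner notes are returned value:{rv}")
            return f"returned value:{rv}"

        def follow(mode: str):
            # Every downstream op has the same shape: run a mode, append the result.
            @op(name=f"{source}_{mode}")
            def follow_op(context, msg: str):
                return msg + f"returned value:{gleanerio(mode, source)}"
            return follow_op

        nabu, prov, orgs, release = (follow(m) for m in ("nabu", "prov", "orgs", "release"))

        @graph(name=f"harvest_{source}")
        def harvest():
            release(orgs(prov(nabu(gleaner_op()))))

        return harvest
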
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_drp.py b/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_drp.py
deleted file mode 100644
index a4b1f6f9..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_drp.py
+++ /dev/null
@@ -1,304 +0,0 @@
-import distutils.util  # strtobool is in the util submodule, which a bare "import distutils" does not load
-
-from dagster import op, graph, get_dagster_logger
-import subprocess
-import os, json, io
-import urllib
-from urllib import request
-from dagster import job, op, get_dagster_logger
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-
-# Vars and Envs
-
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-MINIO_URL = os.environ.get('GLEANER_MINIO_URL')
-MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT')
-MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL')
-MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET')
-MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY')
-MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET')
-
-
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- get_dagster_logger().info(f"S3 URL : {str(os.environ.get('GLEANER_MINIO_URL'))}")
- get_dagster_logger().info(f"S3 PORT : {str(os.environ.get('GLEANER_MINIO_PORT'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {str(os.environ.get('GLEANER_MINIO_BUCKET'))}")
- get_dagster_logger().info(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
- try:
- data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- client = Minio(
- server,
- # secure=True,
- secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
-
- # Make bucket 'X' if it does not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname
- client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'),
- objPrefix,
- io.BytesIO(data),
- len(data))
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-
-
-def gleanerio(mode, source):
- ## ------------ Create
-
- get_dagster_logger().info(f"Create: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH')
- CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"]
- NAME = "gleaner01_" + source
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "nabu"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
- return 1
-
- data = {}
- data["Image"] = IMAGE
- data["Cmd"] = CMD
-
- # add env variables here, in the form "Env": ["FOO=bar", "BAZ=quux"]
- enva = []
- enva.append(str("MINIO_URL={}".format(MINIO_URL)))
- enva.append(str("MINIO_PORT={}".format(MINIO_PORT)))
- enva.append(str("MINIO_SSL={}".format(MINIO_SSL)))
- enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET)))
- enva.append(str("MINIO_KEY={}".format(MINIO_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET)))
-
- data["Env"] = enva
-
- url = URL + 'containers/create'
- params = {
- "name": NAME
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- get_dagster_logger().info(f"URL: {str(url)}")
-
- req = request.Request(url, str.encode(json.dumps(data)))
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- c = r.read()
- d = json.loads(c)
- cid = d['Id']
-
- print(r.status)
- get_dagster_logger().info(f"Create: {str(r.status)}")
-
- # print(cid)
-
- ## ------------ Upload archive: send the config (read from S3 below) into the container
-
- url = URL + 'containers/' + cid + '/archive'
- params = {
- 'path': ARCHIVE_PATH
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
-
- # DATA = read_file_bytestream(ARCHIVE_FILE)
- DATA = s3reader(ARCHIVE_FILE)
-
- req = request.Request(url, data=DATA, method="PUT")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- print(r.status)
- get_dagster_logger().info(f"Archive: {str(r.status)}")
-
- # c = r.read()
- # print(c)
- # d = json.loads(c)
- # print(d)
-
- ## ------------ Start
-
- url = URL + 'containers/' + cid + '/start'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Start: {str(r.status)}")
-
- ## ------------ Wait expect 200
-
- url = URL + 'containers/' + cid + '/wait'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Wait: {str(r.status)}")
-
- ## ------------ Copy logs expect 200
-
- url = URL + 'containers/' + cid + '/logs'
- params = {
- 'stdout': 'true',
- 'stderr': 'false'
- }
- query_string = urllib.parse.urlencode(params)
-
- url = url + "?" + query_string
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- c = r.read()
-
- # write to file
- # f = open(LOGFILE, 'w')
- # f.write(str(c))
- # f.close()
-
- # write to s3
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes-like object
-
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"Logs: {str(r.status)}")
-
- ## ------------ Remove expect 204
-
- url = URL + 'containers/' + cid
- req = request.Request(url, method="DELETE")
- req.add_header('X-API-Key', APIKEY)
- # req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Remove: {str(r.status)}")
-
- return 0
-
-@op
-def drp_gleaner(context):
- returned_value = gleanerio(("gleaner"), "drp")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner notes are {r} ")
- return r
-
-@op
-def drp_nabu(context, msg: str):
- returned_value = gleanerio(("nabu"), "drp")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def drp_nabuprov(context, msg: str):
- returned_value = gleanerio(("prov"), "drp")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def drp_nabuorg(context, msg: str):
- returned_value = gleanerio(("orgs"), "drp")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def drp_naburelease(context, msg: str):
- returned_value = gleanerio(("release"), "drp")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@graph
-def harvest_drp():
- harvest = drp_gleaner()
- load1 = drp_nabu(harvest)
- load2 = drp_nabuprov(load1)
- load3 = drp_nabuorg(load2)
- load4 = drp_naburelease(load3)
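
The gleanerio() body repeated in every module drives one container through the Docker Engine API behind Portainer: POST containers/create, PUT containers/{id}/archive to drop the config tarball in, POST start, POST wait, GET logs, then DELETE. The header and query-string boilerplate around each call is identical and could be factored once; a minimal sketch, assuming the same module-level URL and APIKEY globals:

    import json
    import urllib.parse
    from urllib import request

    def portainer_call(path, method="GET", body=None, params=None,
                       content_type="application/json"):
        # Shared wrapper for the repeated urllib plumbing:
        # URL assembly, API-key header, and accept header.
        url = URL + path
        if params:
            url = url + "?" + urllib.parse.urlencode(params)
        req = request.Request(url, data=body, method=method)
        req.add_header("X-API-Key", APIKEY)
        req.add_header("accept", "application/json")
        if body is not None:
            req.add_header("content-type", content_type)
        return request.urlopen(req)

    # The lifecycle then reads top to bottom:
    # r = portainer_call("containers/create", "POST",
    #                    json.dumps(spec).encode(), {"name": NAME})
    # cid = json.loads(r.read())["Id"]
    # portainer_call(f"containers/{cid}/archive", "PUT", tarball,
    #                {"path": ARCHIVE_PATH}, "application/x-compressed")
    # portainer_call(f"containers/{cid}/start", "POST")
    # portainer_call(f"containers/{cid}/wait", "POST")
    # logs = portainer_call(f"containers/{cid}/logs",
    #                       params={"stdout": "true", "stderr": "false"}).read()
    # portainer_call(f"containers/{cid}", "DELETE")
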
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_dryad.py b/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_dryad.py
deleted file mode 100644
index 05050404..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_dryad.py
+++ /dev/null
@@ -1,304 +0,0 @@
-import distutils.util  # strtobool is in the util submodule, which a bare "import distutils" does not load
-
-from dagster import op, graph, get_dagster_logger
-import subprocess
-import os, json, io
-import urllib
-from urllib import request
-from dagster import job, op, get_dagster_logger
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-
-# Vars and Envs
-
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-MINIO_URL = os.environ.get('GLEANER_MINIO_URL')
-MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT')
-MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL')
-MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET')
-MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY')
-MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET')
-
-
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- get_dagster_logger().info(f"S3 URL : {str(os.environ.get('GLEANER_MINIO_URL'))}")
- get_dagster_logger().info(f"S3 PORT : {str(os.environ.get('GLEANER_MINIO_PORT'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {str(os.environ.get('GLEANER_MINIO_BUCKET'))}")
- get_dagster_logger().info(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
- try:
- data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- client = Minio(
- server,
- # secure=True,
- secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
-
- # Make bucket 'X' if it does not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname
- client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'),
- objPrefix,
- io.BytesIO(data),
- len(data))
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-
-
-def gleanerio(mode, source):
- ## ------------ Create
-
- get_dagster_logger().info(f"Create: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH')
- CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"]
- NAME = "gleaner01_" + source
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "nabu"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
- return 1
-
- data = {}
- data["Image"] = IMAGE
- data["Cmd"] = CMD
-
- # add env variables here, in the form "Env": ["FOO=bar", "BAZ=quux"]
- enva = []
- enva.append(str("MINIO_URL={}".format(MINIO_URL)))
- enva.append(str("MINIO_PORT={}".format(MINIO_PORT)))
- enva.append(str("MINIO_SSL={}".format(MINIO_SSL)))
- enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET)))
- enva.append(str("MINIO_KEY={}".format(MINIO_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET)))
-
- data["Env"] = enva
-
- url = URL + 'containers/create'
- params = {
- "name": NAME
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- get_dagster_logger().info(f"URL: {str(url)}")
-
- req = request.Request(url, str.encode(json.dumps(data)))
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- c = r.read()
- d = json.loads(c)
- cid = d['Id']
-
- print(r.status)
- get_dagster_logger().info(f"Create: {str(r.status)}")
-
- # print(cid)
-
- ## ------------ Upload archive: send the config (read from S3 below) into the container
-
- url = URL + 'containers/' + cid + '/archive'
- params = {
- 'path': ARCHIVE_PATH
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
-
- # DATA = read_file_bytestream(ARCHIVE_FILE)
- DATA = s3reader(ARCHIVE_FILE)
-
- req = request.Request(url, data=DATA, method="PUT")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- print(r.status)
- get_dagster_logger().info(f"Archive: {str(r.status)}")
-
- # c = r.read()
- # print(c)
- # d = json.loads(c)
- # print(d)
-
- ## ------------ Start
-
- url = URL + 'containers/' + cid + '/start'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Start: {str(r.status)}")
-
- ## ------------ Wait expect 200
-
- url = URL + 'containers/' + cid + '/wait'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Wait: {str(r.status)}")
-
- ## ------------ Copy logs expect 200
-
- url = URL + 'containers/' + cid + '/logs'
- params = {
- 'stdout': 'true',
- 'stderr': 'false'
- }
- query_string = urllib.parse.urlencode(params)
-
- url = url + "?" + query_string
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- c = r.read()
-
- # write to file
- # f = open(LOGFILE, 'w')
- # f.write(str(c))
- # f.close()
-
- # write to s3
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes-like object
-
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"Logs: {str(r.status)}")
-
- ## ------------ Remove expect 204
-
- url = URL + 'containers/' + cid
- req = request.Request(url, method="DELETE")
- req.add_header('X-API-Key', APIKEY)
- # req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Remove: {str(r.status)}")
-
- return 0
-
-@op
-def dryad_gleaner(context):
- returned_value = gleanerio(("gleaner"), "dryad")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner notes are {r} ")
- return r
-
-@op
-def dryad_nabu(context, msg: str):
- returned_value = gleanerio(("nabu"), "dryad")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def dryad_nabuprov(context, msg: str):
- returned_value = gleanerio(("prov"), "dryad")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def dryad_nabuorg(context, msg: str):
- returned_value = gleanerio(("orgs"), "dryad")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def dryad_naburelease(context, msg: str):
- returned_value = gleanerio(("release"), "dryad")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@graph
-def harvest_dryad():
- harvest = dryad_gleaner()
- load1 = dryad_nabu(harvest)
- load2 = dryad_nabuprov(load1)
- load3 = dryad_nabuorg(load2)
- load4 = dryad_naburelease(load3)
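
Each module parses GLEANER_MINIO_SSL with distutils.util.strtobool, which needs the distutils.util submodule imported explicitly and is deprecated (distutils is removed in Python 3.12). A dependency-free predicate covers the same truthy spellings; this is a sketch of an alternative, not what the deleted code does:

    import os
    from minio import Minio

    def env_flag(name: str, default: str = "false") -> bool:
        # Same truthy set strtobool accepts: y, yes, t, true, on, 1.
        return os.environ.get(name, default).strip().lower() in (
            "y", "yes", "t", "true", "on", "1")

    def minio_client() -> Minio:
        server = os.environ.get("GLEANER_MINIO_URL") + ":" + \
                 os.environ.get("GLEANER_MINIO_PORT")
        return Minio(
            server,
            secure=env_flag("GLEANER_MINIO_SSL"),
            access_key=os.environ.get("GLEANER_MINIO_KEY"),
            secret_key=os.environ.get("GLEANER_MINIO_SECRET"),
        )
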
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_edatos.py b/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_edatos.py
deleted file mode 100644
index 3fab6345..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_edatos.py
+++ /dev/null
@@ -1,304 +0,0 @@
-import distutils.util  # strtobool is in the util submodule, which a bare "import distutils" does not load
-
-from dagster import op, graph, get_dagster_logger
-import subprocess
-import os, json, io
-import urllib
-from urllib import request
-from dagster import job, op, get_dagster_logger
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-
-# Vars and Envs
-
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-MINIO_URL = os.environ.get('GLEANER_MINIO_URL')
-MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT')
-MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL')
-MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET')
-MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY')
-MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET')
-
-
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- get_dagster_logger().info(f"S3 URL : {str(os.environ.get('GLEANER_MINIO_URL'))}")
- get_dagster_logger().info(f"S3 PORT : {str(os.environ.get('GLEANER_MINIO_PORT'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {str(os.environ.get('GLEANER_MINIO_BUCKET'))}")
- get_dagster_logger().info(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
- try:
- data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- client = Minio(
- server,
- # secure=True,
- secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
-
- # Make bucket 'X' if it does not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname
- client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'),
- objPrefix,
- io.BytesIO(data),
- len(data))
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-
-
-def gleanerio(mode, source):
- ## ------------ Create
-
- get_dagster_logger().info(f"Create: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH')
- CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"]
- NAME = "gleaner01_" + source
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "nabu"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
- return 1
-
- data = {}
- data["Image"] = IMAGE
- data["Cmd"] = CMD
-
- # add env variables here, in the form "Env": ["FOO=bar", "BAZ=quux"]
- enva = []
- enva.append(str("MINIO_URL={}".format(MINIO_URL)))
- enva.append(str("MINIO_PORT={}".format(MINIO_PORT)))
- enva.append(str("MINIO_SSL={}".format(MINIO_SSL)))
- enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET)))
- enva.append(str("MINIO_KEY={}".format(MINIO_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET)))
-
- data["Env"] = enva
-
- url = URL + 'containers/create'
- params = {
- "name": NAME
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- get_dagster_logger().info(f"URL: {str(url)}")
-
- req = request.Request(url, str.encode(json.dumps(data)))
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- c = r.read()
- d = json.loads(c)
- cid = d['Id']
-
- print(r.status)
- get_dagster_logger().info(f"Create: {str(r.status)}")
-
- # print(cid)
-
- ## ------------ Upload archive: send the config (read from S3 below) into the container
-
- url = URL + 'containers/' + cid + '/archive'
- params = {
- 'path': ARCHIVE_PATH
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
-
- # DATA = read_file_bytestream(ARCHIVE_FILE)
- DATA = s3reader(ARCHIVE_FILE)
-
- req = request.Request(url, data=DATA, method="PUT")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- print(r.status)
- get_dagster_logger().info(f"Archive: {str(r.status)}")
-
- # c = r.read()
- # print(c)
- # d = json.loads(c)
- # print(d)
-
- ## ------------ Start
-
- url = URL + 'containers/' + cid + '/start'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Start: {str(r.status)}")
-
- ## ------------ Wait expect 200
-
- url = URL + 'containers/' + cid + '/wait'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Wait: {str(r.status)}")
-
- ## ------------ Copy logs expect 200
-
- url = URL + 'containers/' + cid + '/logs'
- params = {
- 'stdout': 'true',
- 'stderr': 'false'
- }
- query_string = urllib.parse.urlencode(params)
-
- url = url + "?" + query_string
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- c = r.read()
-
- # write to file
- # f = open(LOGFILE, 'w')
- # f.write(str(c))
- # f.close()
-
- # write to s3
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes-like object
-
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"Logs: {str(r.status)}")
-
- ## ------------ Remove expect 204
-
- url = URL + 'containers/' + cid
- req = request.Request(url, method="DELETE")
- req.add_header('X-API-Key', APIKEY)
- # req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Remove: {str(r.status)}")
-
- return 0
-
-@op
-def edatos_gleaner(context):
- returned_value = gleanerio(("gleaner"), "edatos")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner notes are {r} ")
- return r
-
-@op
-def edatos_nabu(context, msg: str):
- returned_value = gleanerio(("nabu"), "edatos")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def edatos_nabuprov(context, msg: str):
- returned_value = gleanerio(("prov"), "edatos")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def edatos_nabuorg(context, msg: str):
- returned_value = gleanerio(("orgs"), "edatos")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def edatos_naburelease(context, msg: str):
- returned_value = gleanerio(("release"), "edatos")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@graph
-def harvest_edatos():
- harvest = edatos_gleaner()
- load1 = edatos_nabu(harvest)
- load2 = edatos_nabuprov(load1)
- load3 = edatos_nabuorg(load2)
- load4 = edatos_naburelease(load3)
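
The if/elif ladder at the top of gleanerio() is a lookup from mode to image, archive object/path, container name, and command line. Written as a table the five modes are easier to audit and extend; a sketch over the same environment variables (mode_config is an illustrative name):

    import os

    def mode_config(mode: str, source: str):
        # Table equivalent of the if/elif ladder in gleanerio().
        nabu = {
            "image": os.environ.get("GLEANERIO_NABU_IMAGE"),
            "archive_file": os.environ.get("GLEANERIO_NABU_ARCHIVE_OBJECT"),
            "archive_path": os.environ.get("GLEANERIO_NABU_ARCHIVE_PATH"),
            "name": "nabu01_" + source,
        }
        table = {
            "gleaner": {
                "image": os.environ.get("GLEANERIO_GLEANER_IMAGE"),
                "archive_file": os.environ.get("GLEANERIO_GLEANER_ARCHIVE_OBJECT"),
                "archive_path": os.environ.get("GLEANERIO_GLEANER_ARCHIVE_PATH"),
                "name": "gleaner01_" + source,
                "cmd": ["--cfg", "/gleaner/gleanerconfig.yaml",
                        "--source", source, "--rude"],
            },
            "nabu": dict(nabu, cmd=["--cfg", "/nabu/nabuconfig.yaml",
                                    "prune", "--prefix", "summoned/" + source]),
            "prov": dict(nabu, cmd=["--cfg", "/nabu/nabuconfig.yaml",
                                    "prefix", "--prefix", "prov/" + source]),
            "orgs": dict(nabu, cmd=["--cfg", "/nabu/nabuconfig.yaml",
                                    "prefix", "--prefix", "orgs"]),
            "release": dict(nabu, cmd=["--cfg", "/nabu/nabuconfig.yaml",
                                       "release", "--prefix", "summoned/" + source]),
        }
        return table.get(mode)  # None signals the unknown-mode case (return 1 above)
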
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_fiu.py b/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_fiu.py
deleted file mode 100644
index 446900c1..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_fiu.py
+++ /dev/null
@@ -1,304 +0,0 @@
-import distutils.util  # strtobool is in the util submodule, which a bare "import distutils" does not load
-
-from dagster import op, graph, get_dagster_logger
-import subprocess
-import os, json, io
-import urllib
-from urllib import request
-from dagster import job, op, get_dagster_logger
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-
-# Vars and Envs
-
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-MINIO_URL = os.environ.get('GLEANER_MINIO_URL')
-MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT')
-MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL')
-MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET')
-MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY')
-MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET')
-
-
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- get_dagster_logger().info(f"S3 URL : {str(os.environ.get('GLEANER_MINIO_URL'))}")
- get_dagster_logger().info(f"S3 PORT : {str(os.environ.get('GLEANER_MINIO_PORT'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {str(os.environ.get('GLEANER_MINIO_BUCKET'))}")
- get_dagster_logger().info(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
- try:
- data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- client = Minio(
- server,
- # secure=True,
- secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
-
- # Make bucket 'X' if it does not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname
- client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'),
- objPrefix,
- io.BytesIO(data),
- len(data))
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-
-
-def gleanerio(mode, source):
- ## ------------ Create
-
- get_dagster_logger().info(f"Create: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH')
- CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"]
- NAME = "gleaner01_" + source
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "nabu"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
- return 1
-
- data = {}
- data["Image"] = IMAGE
- data["Cmd"] = CMD
-
- # add env variables here, in the form "Env": ["FOO=bar", "BAZ=quux"]
- enva = []
- enva.append(str("MINIO_URL={}".format(MINIO_URL)))
- enva.append(str("MINIO_PORT={}".format(MINIO_PORT)))
- enva.append(str("MINIO_SSL={}".format(MINIO_SSL)))
- enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET)))
- enva.append(str("MINIO_KEY={}".format(MINIO_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET)))
-
- data["Env"] = enva
-
- url = URL + 'containers/create'
- params = {
- "name": NAME
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- get_dagster_logger().info(f"URL: {str(url)}")
-
- req = request.Request(url, str.encode(json.dumps(data)))
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- c = r.read()
- d = json.loads(c)
- cid = d['Id']
-
- print(r.status)
- get_dagster_logger().info(f"Create: {str(r.status)}")
-
- # print(cid)
-
- ## ------------ Upload archive: send the config (read from S3 below) into the container
-
- url = URL + 'containers/' + cid + '/archive'
- params = {
- 'path': ARCHIVE_PATH
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
-
- # DATA = read_file_bytestream(ARCHIVE_FILE)
- DATA = s3reader(ARCHIVE_FILE)
-
- req = request.Request(url, data=DATA, method="PUT")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- print(r.status)
- get_dagster_logger().info(f"Archive: {str(r.status)}")
-
- # c = r.read()
- # print(c)
- # d = json.loads(c)
- # print(d)
-
- ## ------------ Start
-
- url = URL + 'containers/' + cid + '/start'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Start: {str(r.status)}")
-
- ## ------------ Wait expect 200
-
- url = URL + 'containers/' + cid + '/wait'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Wait: {str(r.status)}")
-
- ## ------------ Copy logs expect 200
-
- url = URL + 'containers/' + cid + '/logs'
- params = {
- 'stdout': 'true',
- 'stderr': 'false'
- }
- query_string = urllib.parse.urlencode(params)
-
- url = url + "?" + query_string
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- c = r.read()
-
- # write to file
- # f = open(LOGFILE, 'w')
- # f.write(str(c))
- # f.close()
-
- # write to s3
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes-like object
-
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"Logs: {str(r.status)}")
-
- ## ------------ Remove expect 204
-
- url = URL + 'containers/' + cid
- req = request.Request(url, method="DELETE")
- req.add_header('X-API-Key', APIKEY)
- # req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Remove: {str(r.status)}")
-
- return 0
-
-@op
-def fiu_gleaner(context):
- returned_value = gleanerio(("gleaner"), "fiu")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner notes are {r} ")
- return r
-
-@op
-def fiu_nabu(context, msg: str):
- returned_value = gleanerio(("nabu"), "fiu")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def fiu_nabuprov(context, msg: str):
- returned_value = gleanerio(("prov"), "fiu")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def fiu_nabuorg(context, msg: str):
- returned_value = gleanerio(("orgs"), "fiu")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def fiu_naburelease(context, msg: str):
- returned_value = gleanerio(("release"), "fiu")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@graph
-def harvest_fiu():
- harvest = fiu_gleaner()
- load1 = fiu_nabu(harvest)
- load2 = fiu_nabuprov(load1)
- load3 = fiu_nabuorg(load2)
- load4 = fiu_naburelease(load3)
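
In the logs step, c = r.read() already yields bytes, so s3loader(str(c).encode(), NAME) stores the b'...' repr with escaped newlines rather than the log text itself. If the raw text is wanted, the bytes can be passed straight through; a sketch (upload_log is an illustrative name, client/bucket/prefix match the s3loader environment values):

    import io
    from datetime import datetime

    def upload_log(client, bucket: str, prefix: str, name: str, raw: bytes) -> str:
        # raw is the bytes from r.read(); upload it as-is instead of
        # wrapping it in str(), which would store the b'...' repr.
        stamp = datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
        key = f"{prefix}{name}_{stamp}.log"
        client.put_object(bucket, key, io.BytesIO(raw), len(raw))
        return key
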
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_gro.py b/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_gro.py
deleted file mode 100644
index 577780a1..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_gro.py
+++ /dev/null
@@ -1,304 +0,0 @@
-import distutils.util  # strtobool is in the util submodule, which a bare "import distutils" does not load
-
-from dagster import op, graph, get_dagster_logger
-import subprocess
-import os, json, io
-import urllib
-from urllib import request
-from dagster import job, op, get_dagster_logger
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-
-# Vars and Envs
-
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-MINIO_URL = os.environ.get('GLEANER_MINIO_URL')
-MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT')
-MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL')
-MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET')
-MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY')
-MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET')
-
-
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- get_dagster_logger().info(f"S3 URL : {str(os.environ.get('GLEANER_MINIO_URL'))}")
- get_dagster_logger().info(f"S3 PORT : {str(os.environ.get('GLEANER_MINIO_PORT'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {str(os.environ.get('GLEANER_MINIO_BUCKET'))}")
- get_dagster_logger().info(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
- try:
- data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- client = Minio(
- server,
- # secure=True,
- secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
-
- # Make bucket 'X' if it does not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname
- client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'),
- objPrefix,
- io.BytesIO(data),
- len(data))
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-
-
-def gleanerio(mode, source):
- ## ------------ Create
-
- get_dagster_logger().info(f"Create: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH')
- CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"]
- NAME = "gleaner01_" + source
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "nabu"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
- return 1
-
- data = {}
- data["Image"] = IMAGE
- data["Cmd"] = CMD
-
- # add env variables here, in the form "Env": ["FOO=bar", "BAZ=quux"]
- enva = []
- enva.append(str("MINIO_URL={}".format(MINIO_URL)))
- enva.append(str("MINIO_PORT={}".format(MINIO_PORT)))
- enva.append(str("MINIO_SSL={}".format(MINIO_SSL)))
- enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET)))
- enva.append(str("MINIO_KEY={}".format(MINIO_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET)))
-
- data["Env"] = enva
-
- url = URL + 'containers/create'
- params = {
- "name": NAME
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- get_dagster_logger().info(f"URL: {str(url)}")
-
- req = request.Request(url, str.encode(json.dumps(data)))
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- c = r.read()
- d = json.loads(c)
- cid = d['Id']
-
- print(r.status)
- get_dagster_logger().info(f"Create: {str(r.status)}")
-
- # print(cid)
-
- ## ------------ Upload archive: send the config (read from S3 below) into the container
-
- url = URL + 'containers/' + cid + '/archive'
- params = {
- 'path': ARCHIVE_PATH
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
-
- # DATA = read_file_bytestream(ARCHIVE_FILE)
- DATA = s3reader(ARCHIVE_FILE)
-
- req = request.Request(url, data=DATA, method="PUT")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- print(r.status)
- get_dagster_logger().info(f"Archive: {str(r.status)}")
-
- # c = r.read()
- # print(c)
- # d = json.loads(c)
- # print(d)
-
- ## ------------ Start
-
- url = URL + 'containers/' + cid + '/start'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Start: {str(r.status)}")
-
- ## ------------ Wait expect 200
-
- url = URL + 'containers/' + cid + '/wait'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Wait: {str(r.status)}")
-
- ## ------------ Copy logs expect 200
-
- url = URL + 'containers/' + cid + '/logs'
- params = {
- 'stdout': 'true',
- 'stderr': 'false'
- }
- query_string = urllib.parse.urlencode(params)
-
- url = url + "?" + query_string
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- c = r.read()
-
- # write to file
- # f = open(LOGFILE, 'w')
- # f.write(str(c))
- # f.close()
-
- # write to s3
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes-like object
-
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"Logs: {str(r.status)}")
-
- ## ------------ Remove expect 204
-
- url = URL + 'containers/' + cid
- req = request.Request(url, method="DELETE")
- req.add_header('X-API-Key', APIKEY)
- # req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Remove: {str(r.status)}")
-
- return 0
-
-@op
-def gro_gleaner(context):
- returned_value = gleanerio(("gleaner"), "gro")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner notes are {r} ")
- return r
-
-@op
-def gro_nabu(context, msg: str):
- returned_value = gleanerio(("nabu"), "gro")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def gro_nabuprov(context, msg: str):
- returned_value = gleanerio(("prov"), "gro")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def gro_nabuorg(context, msg: str):
- returned_value = gleanerio(("orgs"), "gro")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def gro_naburelease(context, msg: str):
- returned_value = gleanerio(("release"), "gro")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@graph
-def harvest_gro():
- harvest = gro_gleaner()
- load1 = gro_nabu(harvest)
- load2 = gro_nabuprov(load1)
- load3 = gro_nabuorg(load2)
- load4 = gro_naburelease(load3)
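
The modules stop at the @graph definition; running one on a cadence takes a to_job() binding, typically paired with a schedule. A sketch of that wiring for one graph, with the job name and cron string purely illustrative rather than taken from this repo:

    from dagster import Definitions, ScheduleDefinition

    # harvest_gro is the @graph defined above; to_job() binds it into a job.
    harvest_gro_job = harvest_gro.to_job(name="harvest_gro_job")

    defs = Definitions(
        jobs=[harvest_gro_job],
        schedules=[
            ScheduleDefinition(
                job=harvest_gro_job,
                cron_schedule="0 6 * * 0",  # weekly; an illustrative choice
            )
        ],
    )
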
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_harvard.py b/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_harvard.py
deleted file mode 100644
index fc6631df..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_harvard.py
+++ /dev/null
@@ -1,304 +0,0 @@
-import distutils.util  # strtobool is in the util submodule, which a bare "import distutils" does not load
-
-from dagster import op, graph, get_dagster_logger
-import subprocess
-import os, json, io
-import urllib
-from urllib import request
-from dagster import job, op, get_dagster_logger
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-
-# Vars and Envs
-
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-MINIO_URL = os.environ.get('GLEANER_MINIO_URL')
-MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT')
-MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL')
-MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET')
-MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY')
-MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET')
-
-
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- get_dagster_logger().info(f"S3 URL : {str(os.environ.get('GLEANER_MINIO_URL'))}")
- get_dagster_logger().info(f"S3 PORT : {str(os.environ.get('GLEANER_MINIO_PORT'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {str(os.environ.get('GLEANER_MINIO_BUCKET'))}")
- get_dagster_logger().info(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
- try:
- data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- client = Minio(
- server,
- # secure=True,
- secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname
- client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'),
- objPrefix,
- io.BytesIO(data),
- len(data))
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-
-
-def gleanerio(mode, source):
- ## ------------ Create
-
- get_dagster_logger().info(f"Create: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH')
- CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"]
- NAME = "gleaner01_" + source
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "nabu"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
- return 1
-
- data = {}
- data["Image"] = IMAGE
- data["Cmd"] = CMD
-
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
- enva = []
- enva.append(str("MINIO_URL={}".format(MINIO_URL)))
- enva.append(str("MINIO_PORT={}".format(MINIO_PORT)))
- enva.append(str("MINIO_SSL={}".format(MINIO_SSL)))
- enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET)))
- enva.append(str("MINIO_KEY={}".format(MINIO_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET)))
-
- data["Env"] = enva
-
- url = URL + 'containers/create'
- params = {
- "name": NAME
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- get_dagster_logger().info(f"URL: {str(url)}")
-
- req = request.Request(url, str.encode(json.dumps(data)))
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- c = r.read()
- d = json.loads(c)
- cid = d['Id']
-
- print(r.status)
- get_dagster_logger().info(f"Create: {str(r.status)}")
-
- # print(cid)
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
- url = URL + 'containers/' + cid + '/archive'
- params = {
- 'path': ARCHIVE_PATH
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
-
- # DATA = read_file_bytestream(ARCHIVE_FILE)
- DATA = s3reader(ARCHIVE_FILE)
-
- req = request.Request(url, data=DATA, method="PUT")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- print(r.status)
- get_dagster_logger().info(f"Archive: {str(r.status)}")
-
- # c = r.read()
- # print(c)
- # d = json.loads(c)
- # print(d)
-
- ## ------------ Start
-
- url = URL + 'containers/' + cid + '/start'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Start: {str(r.status)}")
-
- ## ------------ Wait expect 200
-
- url = URL + 'containers/' + cid + '/wait'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Wait: {str(r.status)}")
-
- ## ------------ Copy logs expect 200
-
- url = URL + 'containers/' + cid + '/logs'
- params = {
- 'stdout': 'true',
- 'stderr': 'false'
- }
- query_string = urllib.parse.urlencode(params)
-
- url = url + "?" + query_string
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- c = r.read()
-
- # write to file
- # f = open(LOGFILE, 'w')
- # f.write(str(c))
- # f.close()
-
- # write to s3
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
-
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"Logs: {str(r.status)}")
-
- ## ------------ Remove expect 204
-
- url = URL + 'containers/' + cid
- req = request.Request(url, method="DELETE")
- req.add_header('X-API-Key', APIKEY)
- # req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Remove: {str(r.status)}")
-
- return 0
-
-@op
-def harvard_gleaner(context):
- returned_value = gleanerio(("gleaner"), "harvard")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner notes are {r} ")
- return r
-
-@op
-def harvard_nabu(context, msg: str):
- returned_value = gleanerio(("nabu"), "harvard")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def harvard_nabuprov(context, msg: str):
- returned_value = gleanerio(("prov"), "harvard")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def harvard_nabuorg(context, msg: str):
- returned_value = gleanerio(("orgs"), "harvard")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def harvard_naburelease(context, msg: str):
- returned_value = gleanerio(("release"), "harvard")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@graph
-def harvest_harvard():
- harvest = harvard_gleaner()
- load1 = harvard_nabu(harvest)
- load2 = harvard_nabuprov(load1)
- load3 = harvard_nabuorg(load2)
- load4 = harvard_naburelease(load3)
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_hopkins.py b/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_hopkins.py
deleted file mode 100644
index 0c0f5ab0..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_hopkins.py
+++ /dev/null
@@ -1,304 +0,0 @@
-import distutils
-
-from dagster import op, graph, get_dagster_logger
-import subprocess
-import os, json, io
-import urllib
-from urllib import request
-from dagster import job, op, get_dagster_logger
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-
-# Vars and Envs
-
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-MINIO_URL = os.environ.get('GLEANER_MINIO_URL')
-MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT')
-MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL')
-MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET')
-MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY')
-MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET')
-
-
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- get_dagster_logger().info(f"S3 URL : {str(os.environ.get('GLEANER_MINIO_URL'))}")
- get_dagster_logger().info(f"S3 PORT : {str(os.environ.get('GLEANER_MINIO_PORT'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {str(os.environ.get('GLEANER_MINIO_BUCKET'))}")
- get_dagster_logger().info(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
- try:
- data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- client = Minio(
- server,
- # secure=True,
- secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname
- client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'),
- objPrefix,
- io.BytesIO(data),
- len(data))
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-
-
-def gleanerio(mode, source):
- ## ------------ Create
-
- get_dagster_logger().info(f"Create: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH')
- CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"]
- NAME = "gleaner01_" + source
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "nabu"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
- return 1
-
- data = {}
- data["Image"] = IMAGE
- data["Cmd"] = CMD
-
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
- enva = []
- enva.append(str("MINIO_URL={}".format(MINIO_URL)))
- enva.append(str("MINIO_PORT={}".format(MINIO_PORT)))
- enva.append(str("MINIO_SSL={}".format(MINIO_SSL)))
- enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET)))
- enva.append(str("MINIO_KEY={}".format(MINIO_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET)))
-
- data["Env"] = enva
-
- url = URL + 'containers/create'
- params = {
- "name": NAME
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- get_dagster_logger().info(f"URL: {str(url)}")
-
- req = request.Request(url, str.encode(json.dumps(data)))
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- c = r.read()
- d = json.loads(c)
- cid = d['Id']
-
- print(r.status)
- get_dagster_logger().info(f"Create: {str(r.status)}")
-
- # print(cid)
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
- url = URL + 'containers/' + cid + '/archive'
- params = {
- 'path': ARCHIVE_PATH
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
-
- # DATA = read_file_bytestream(ARCHIVE_FILE)
- DATA = s3reader(ARCHIVE_FILE)
-
- req = request.Request(url, data=DATA, method="PUT")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- print(r.status)
- get_dagster_logger().info(f"Archive: {str(r.status)}")
-
- # c = r.read()
- # print(c)
- # d = json.loads(c)
- # print(d)
-
- ## ------------ Start
-
- url = URL + 'containers/' + cid + '/start'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Start: {str(r.status)}")
-
- ## ------------ Wait expect 200
-
- url = URL + 'containers/' + cid + '/wait'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Wait: {str(r.status)}")
-
- ## ------------ Copy logs expect 200
-
- url = URL + 'containers/' + cid + '/logs'
- params = {
- 'stdout': 'true',
- 'stderr': 'false'
- }
- query_string = urllib.parse.urlencode(params)
-
- url = url + "?" + query_string
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- c = r.read()
-
- # write to file
- # f = open(LOGFILE, 'w')
- # f.write(str(c))
- # f.close()
-
- # write to s3
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
-
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"Logs: {str(r.status)}")
-
- ## ------------ Remove expect 204
-
- url = URL + 'containers/' + cid
- req = request.Request(url, method="DELETE")
- req.add_header('X-API-Key', APIKEY)
- # req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Remove: {str(r.status)}")
-
- return 0
-
-@op
-def hopkins_gleaner(context):
- returned_value = gleanerio(("gleaner"), "hopkins")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner notes are {r} ")
- return r
-
-@op
-def hopkins_nabu(context, msg: str):
- returned_value = gleanerio(("nabu"), "hopkins")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def hopkins_nabuprov(context, msg: str):
- returned_value = gleanerio(("prov"), "hopkins")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def hopkins_nabuorg(context, msg: str):
- returned_value = gleanerio(("orgs"), "hopkins")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def hopkins_naburelease(context, msg: str):
- returned_value = gleanerio(("release"), "hopkins")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@graph
-def harvest_hopkins():
- harvest = hopkins_gleaner()
- load1 = hopkins_nabu(harvest)
- load2 = hopkins_nabuprov(load1)
- load3 = hopkins_nabuorg(load2)
- load4 = hopkins_naburelease(load3)
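
Each deleted `gleanerio()` drives the same six Portainer calls in order: create the container, PUT a config tarball into it, start it, block on `/wait`, pull stdout via `/logs`, then DELETE it. A condensed sketch of that lifecycle, assuming the same `PORTAINER_URL`/`PORTAINER_KEY` env vars and Docker-Engine-style endpoints the code above targets (`call` and `run_once` are illustrative names):

```python
# Condensed sketch of the container lifecycle walked through above.
import json
import os
from urllib import parse, request

URL = os.environ["PORTAINER_URL"]    # base of the Docker Engine API proxy
APIKEY = os.environ["PORTAINER_KEY"]


def call(path, method="GET", body=None, content_type="application/json", params=None):
    url = URL + path
    if params:
        url += "?" + parse.urlencode(params)
    req = request.Request(url, data=body, method=method)
    req.add_header("X-API-Key", APIKEY)
    req.add_header("content-type", content_type)
    req.add_header("accept", "application/json")
    return request.urlopen(req)


def run_once(image, cmd, name, archive_bytes, archive_path):
    spec = json.dumps({"Image": image, "Cmd": cmd}).encode()
    cid = json.load(call("containers/create", "POST", spec, params={"name": name}))["Id"]
    # Push the config tarball into the container before it starts.
    call(f"containers/{cid}/archive", "PUT", archive_bytes,
         content_type="application/x-compressed", params={"path": archive_path})
    call(f"containers/{cid}/start", "POST")
    call(f"containers/{cid}/wait", "POST")   # blocks until exit; expect 200
    logs = call(f"containers/{cid}/logs",
                params={"stdout": "true", "stderr": "false"}).read()
    call(f"containers/{cid}", "DELETE")      # expect 204
    return logs
```
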
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_hord.py b/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_hord.py
deleted file mode 100644
index f10fa238..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_hord.py
+++ /dev/null
@@ -1,304 +0,0 @@
-import distutils
-
-from dagster import op, graph, get_dagster_logger
-import subprocess
-import os, json, io
-import urllib
-from urllib import request
-from dagster import job, op, get_dagster_logger
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-
-# Vars and Envs
-
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-MINIO_URL = os.environ.get('GLEANER_MINIO_URL')
-MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT')
-MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL')
-MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET')
-MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY')
-MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET')
-
-
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- get_dagster_logger().info(f"S3 URL : {str(os.environ.get('GLEANER_MINIO_URL'))}")
- get_dagster_logger().info(f"S3 PORT : {str(os.environ.get('GLEANER_MINIO_PORT'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {str(os.environ.get('GLEANER_MINIO_BUCKET'))}")
- get_dagster_logger().info(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
- try:
- data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- client = Minio(
- server,
- # secure=True,
- secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname
- client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'),
- objPrefix,
- io.BytesIO(data),
- len(data))
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-
-
-def gleanerio(mode, source):
- ## ------------ Create
-
- get_dagster_logger().info(f"Create: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH')
- CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"]
- NAME = "gleaner01_" + source
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "nabu"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
- return 1
-
- data = {}
- data["Image"] = IMAGE
- data["Cmd"] = CMD
-
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
- enva = []
- enva.append(str("MINIO_URL={}".format(MINIO_URL)))
- enva.append(str("MINIO_PORT={}".format(MINIO_PORT)))
- enva.append(str("MINIO_SSL={}".format(MINIO_SSL)))
- enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET)))
- enva.append(str("MINIO_KEY={}".format(MINIO_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET)))
-
- data["Env"] = enva
-
- url = URL + 'containers/create'
- params = {
- "name": NAME
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- get_dagster_logger().info(f"URL: {str(url)}")
-
- req = request.Request(url, str.encode(json.dumps(data)))
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- c = r.read()
- d = json.loads(c)
- cid = d['Id']
-
- print(r.status)
- get_dagster_logger().info(f"Create: {str(r.status)}")
-
- # print(cid)
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
- url = URL + 'containers/' + cid + '/archive'
- params = {
- 'path': ARCHIVE_PATH
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
-
- # DATA = read_file_bytestream(ARCHIVE_FILE)
- DATA = s3reader(ARCHIVE_FILE)
-
- req = request.Request(url, data=DATA, method="PUT")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- print(r.status)
- get_dagster_logger().info(f"Archive: {str(r.status)}")
-
- # c = r.read()
- # print(c)
- # d = json.loads(c)
- # print(d)
-
- ## ------------ Start
-
- url = URL + 'containers/' + cid + '/start'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Start: {str(r.status)}")
-
- ## ------------ Wait expect 200
-
- url = URL + 'containers/' + cid + '/wait'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Wait: {str(r.status)}")
-
- ## ------------ Copy logs expect 200
-
- url = URL + 'containers/' + cid + '/logs'
- params = {
- 'stdout': 'true',
- 'stderr': 'false'
- }
- query_string = urllib.parse.urlencode(params)
-
- url = url + "?" + query_string
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- c = r.read()
-
- # write to file
- # f = open(LOGFILE, 'w')
- # f.write(str(c))
- # f.close()
-
- # write to s3
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
-
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"Logs: {str(r.status)}")
-
- ## ------------ Remove expect 204
-
- url = URL + 'containers/' + cid
- req = request.Request(url, method="DELETE")
- req.add_header('X-API-Key', APIKEY)
- # req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Remove: {str(r.status)}")
-
- return 0
-
-@op
-def hord_gleaner(context):
- returned_value = gleanerio(("gleaner"), "hord")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner notes are {r} ")
- return r
-
-@op
-def hord_nabu(context, msg: str):
- returned_value = gleanerio(("nabu"), "hord")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def hord_nabuprov(context, msg: str):
- returned_value = gleanerio(("prov"), "hord")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def hord_nabuorg(context, msg: str):
- returned_value = gleanerio(("orgs"), "hord")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def hord_naburelease(context, msg: str):
- returned_value = gleanerio(("release"), "hord")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@graph
-def harvest_hord():
- harvest = hord_gleaner()
- load1 = hord_nabu(harvest)
- load2 = hord_nabuprov(load1)
- load3 = hord_nabuorg(load2)
- load4 = hord_naburelease(load3)
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_ibict.py b/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_ibict.py
deleted file mode 100644
index 4eaceeb6..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_ibict.py
+++ /dev/null
@@ -1,304 +0,0 @@
-import distutils
-
-from dagster import op, graph, get_dagster_logger
-import subprocess
-import os, json, io
-import urllib
-from urllib import request
-from dagster import job, op, get_dagster_logger
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-
-# Vars and Envs
-
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-MINIO_URL = os.environ.get('GLEANER_MINIO_URL')
-MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT')
-MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL')
-MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET')
-MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY')
-MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET')
-
-
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- get_dagster_logger().info(f"S3 URL : {str(os.environ.get('GLEANER_MINIO_URL'))}")
- get_dagster_logger().info(f"S3 PORT : {str(os.environ.get('GLEANER_MINIO_PORT'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {str(os.environ.get('GLEANER_MINIO_BUCKET'))}")
- get_dagster_logger().info(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
- try:
- data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- client = Minio(
- server,
- # secure=True,
- secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname
- client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'),
- objPrefix,
- io.BytesIO(data),
- len(data))
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-
-
-def gleanerio(mode, source):
- ## ------------ Create
-
- get_dagster_logger().info(f"Create: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH')
- CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"]
- NAME = "gleaner01_" + source
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "nabu"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
- return 1
-
- data = {}
- data["Image"] = IMAGE
- data["Cmd"] = CMD
-
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
- enva = []
- enva.append(str("MINIO_URL={}".format(MINIO_URL)))
- enva.append(str("MINIO_PORT={}".format(MINIO_PORT)))
- enva.append(str("MINIO_SSL={}".format(MINIO_SSL)))
- enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET)))
- enva.append(str("MINIO_KEY={}".format(MINIO_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET)))
-
- data["Env"] = enva
-
- url = URL + 'containers/create'
- params = {
- "name": NAME
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- get_dagster_logger().info(f"URL: {str(url)}")
-
- req = request.Request(url, str.encode(json.dumps(data)))
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- c = r.read()
- d = json.loads(c)
- cid = d['Id']
-
- print(r.status)
- get_dagster_logger().info(f"Create: {str(r.status)}")
-
- # print(cid)
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
- url = URL + 'containers/' + cid + '/archive'
- params = {
- 'path': ARCHIVE_PATH
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
-
- # DATA = read_file_bytestream(ARCHIVE_FILE)
- DATA = s3reader(ARCHIVE_FILE)
-
- req = request.Request(url, data=DATA, method="PUT")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- print(r.status)
- get_dagster_logger().info(f"Archive: {str(r.status)}")
-
- # c = r.read()
- # print(c)
- # d = json.loads(c)
- # print(d)
-
- ## ------------ Start
-
- url = URL + 'containers/' + cid + '/start'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Start: {str(r.status)}")
-
- ## ------------ Wait expect 200
-
- url = URL + 'containers/' + cid + '/wait'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Wait: {str(r.status)}")
-
- ## ------------ Copy logs expect 200
-
- url = URL + 'containers/' + cid + '/logs'
- params = {
- 'stdout': 'true',
- 'stderr': 'false'
- }
- query_string = urllib.parse.urlencode(params)
-
- url = url + "?" + query_string
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- c = r.read()
-
- # write to file
- # f = open(LOGFILE, 'w')
- # f.write(str(c))
- # f.close()
-
- # write to s3
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
-
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"Logs: {str(r.status)}")
-
- ## ------------ Remove expect 204
-
- url = URL + 'containers/' + cid
- req = request.Request(url, method="DELETE")
- req.add_header('X-API-Key', APIKEY)
- # req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Remove: {str(r.status)}")
-
- return 0
-
-@op
-def ibict_gleaner(context):
- returned_value = gleanerio(("gleaner"), "ibict")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner notes are {r} ")
- return r
-
-@op
-def ibict_nabu(context, msg: str):
- returned_value = gleanerio(("nabu"), "ibict")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def ibict_nabuprov(context, msg: str):
- returned_value = gleanerio(("prov"), "ibict")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def ibict_nabuorg(context, msg: str):
- returned_value = gleanerio(("orgs"), "ibict")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def ibict_naburelease(context, msg: str):
- returned_value = gleanerio(("release"), "ibict")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@graph
-def harvest_ibict():
- harvest = ibict_gleaner()
- load1 = ibict_nabu(harvest)
- load2 = ibict_nabuprov(load1)
- load3 = ibict_nabuorg(load2)
- load4 = ibict_naburelease(load3)
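
The `s3loader()` copies above all follow one pattern: timestamp the log name, prefix it with `GLEANERIO_LOG_PREFIX`, and `put_object` it into the Gleaner bucket. A self-contained sketch of that path, with a plain truthiness helper standing in for `distutils.util.strtobool` (deprecated and removed from the stdlib in Python 3.12); `upload_log` and `_truthy` are illustrative names:

```python
# Sketch of the MinIO log-upload path used by the deleted s3loader().
import io
import os
from datetime import datetime

from minio import Minio


def _truthy(v: str) -> bool:
    # Replacement for the deprecated distutils.util.strtobool.
    return v.strip().lower() in ("1", "true", "yes", "on")


def upload_log(data: bytes, name: str) -> str:
    client = Minio(
        f"{os.environ['GLEANER_MINIO_URL']}:{os.environ['GLEANER_MINIO_PORT']}",
        secure=_truthy(os.environ.get("GLEANER_MINIO_SSL", "false")),
        access_key=os.environ["GLEANER_MINIO_KEY"],
        secret_key=os.environ["GLEANER_MINIO_SECRET"],
    )
    stamp = datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
    obj = os.environ["GLEANERIO_LOG_PREFIX"] + f"{name}_{stamp}.log"
    client.put_object(os.environ["GLEANER_MINIO_BUCKET"], obj,
                      io.BytesIO(data), len(data))
    return obj
```
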
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_icarda.py b/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_icarda.py
deleted file mode 100644
index 1b0f8e74..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_icarda.py
+++ /dev/null
@@ -1,304 +0,0 @@
-import distutils
-
-from dagster import op, graph, get_dagster_logger
-import subprocess
-import os, json, io
-import urllib
-from urllib import request
-from dagster import job, op, get_dagster_logger
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-
-# Vars and Envs
-
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-MINIO_URL = os.environ.get('GLEANER_MINIO_URL')
-MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT')
-MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL')
-MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET')
-MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY')
-MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET')
-
-
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- get_dagster_logger().info(f"S3 URL : {str(os.environ.get('GLEANER_MINIO_URL'))}")
- get_dagster_logger().info(f"S3 PORT : {str(os.environ.get('GLEANER_MINIO_PORT'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {str(os.environ.get('GLEANER_MINIO_BUCKET'))}")
- get_dagster_logger().info(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
- try:
- data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- client = Minio(
- server,
- # secure=True,
- secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname
- client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'),
- objPrefix,
- io.BytesIO(data),
- len(data))
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-
-
-def gleanerio(mode, source):
- ## ------------ Create
-
- get_dagster_logger().info(f"Create: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH')
- CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"]
- NAME = "gleaner01_" + source
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "nabu"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
- return 1
-
- data = {}
- data["Image"] = IMAGE
- data["Cmd"] = CMD
-
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
- enva = []
- enva.append(str("MINIO_URL={}".format(MINIO_URL)))
- enva.append(str("MINIO_PORT={}".format(MINIO_PORT)))
- enva.append(str("MINIO_SSL={}".format(MINIO_SSL)))
- enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET)))
- enva.append(str("MINIO_KEY={}".format(MINIO_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET)))
-
- data["Env"] = enva
-
- url = URL + 'containers/create'
- params = {
- "name": NAME
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- get_dagster_logger().info(f"URL: {str(url)}")
-
- req = request.Request(url, str.encode(json.dumps(data)))
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- c = r.read()
- d = json.loads(c)
- cid = d['Id']
-
- print(r.status)
- get_dagster_logger().info(f"Create: {str(r.status)}")
-
- # print(cid)
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
- url = URL + 'containers/' + cid + '/archive'
- params = {
- 'path': ARCHIVE_PATH
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
-
- # DATA = read_file_bytestream(ARCHIVE_FILE)
- DATA = s3reader(ARCHIVE_FILE)
-
- req = request.Request(url, data=DATA, method="PUT")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- print(r.status)
- get_dagster_logger().info(f"Archive: {str(r.status)}")
-
- # c = r.read()
- # print(c)
- # d = json.loads(c)
- # print(d)
-
- ## ------------ Start
-
- url = URL + 'containers/' + cid + '/start'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Start: {str(r.status)}")
-
- ## ------------ Wait expect 200
-
- url = URL + 'containers/' + cid + '/wait'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Wait: {str(r.status)}")
-
- ## ------------ Copy logs expect 200
-
- url = URL + 'containers/' + cid + '/logs'
- params = {
- 'stdout': 'true',
- 'stderr': 'false'
- }
- query_string = urllib.parse.urlencode(params)
-
- url = url + "?" + query_string
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- c = r.read()
-
- # write to file
- # f = open(LOGFILE, 'w')
- # f.write(str(c))
- # f.close()
-
- # write to s3
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
-
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"Logs: {str(r.status)}")
-
- ## ------------ Remove expect 204
-
- url = URL + 'containers/' + cid
- req = request.Request(url, method="DELETE")
- req.add_header('X-API-Key', APIKEY)
- # req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Remove: {str(r.status)}")
-
- return 0
-
-@op
-def icarda_gleaner(context):
- returned_value = gleanerio(("gleaner"), "icarda")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner notes are {r} ")
- return r
-
-@op
-def icarda_nabu(context, msg: str):
- returned_value = gleanerio(("nabu"), "icarda")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def icarda_nabuprov(context, msg: str):
- returned_value = gleanerio(("prov"), "icarda")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def icarda_nabuorg(context, msg: str):
- returned_value = gleanerio(("orgs"), "icarda")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def icarda_naburelease(context, msg: str):
- returned_value = gleanerio(("release"), "icarda")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@graph
-def harvest_icarda():
- harvest = icarda_gleaner()
- load1 = icarda_nabu(harvest)
- load2 = icarda_nabuprov(load1)
- load3 = icarda_nabuorg(load2)
- load4 = icarda_naburelease(load3)
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_icrisat.py b/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_icrisat.py
deleted file mode 100644
index dd5832dc..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_icrisat.py
+++ /dev/null
@@ -1,304 +0,0 @@
-import distutils
-
-from dagster import op, graph, get_dagster_logger
-import subprocess
-import os, json, io
-import urllib
-from urllib import request
-from dagster import job, op, get_dagster_logger
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-
-# Vars and Envs
-
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-MINIO_URL = os.environ.get('GLEANER_MINIO_URL')
-MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT')
-MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL')
-MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET')
-MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY')
-MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET')
-
-
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- get_dagster_logger().info(f"S3 URL : {str(os.environ.get('GLEANER_MINIO_URL'))}")
- get_dagster_logger().info(f"S3 PORT : {str(os.environ.get('GLEANER_MINIO_PORT'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {str(os.environ.get('GLEANER_MINIO_BUCKET'))}")
- get_dagster_logger().info(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
- try:
- data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- client = Minio(
- server,
- # secure=True,
- secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname
- client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'),
- objPrefix,
- io.BytesIO(data),
- len(data))
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-
-
-def gleanerio(mode, source):
- ## ------------ Create
-
- get_dagster_logger().info(f"Create: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH')
- CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"]
- NAME = "gleaner01_" + source
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "nabu"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
- return 1
-
- data = {}
- data["Image"] = IMAGE
- data["Cmd"] = CMD
-
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
- enva = []
- enva.append(str("MINIO_URL={}".format(MINIO_URL)))
- enva.append(str("MINIO_PORT={}".format(MINIO_PORT)))
- enva.append(str("MINIO_SSL={}".format(MINIO_SSL)))
- enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET)))
- enva.append(str("MINIO_KEY={}".format(MINIO_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET)))
-
- data["Env"] = enva
-
- url = URL + 'containers/create'
- params = {
- "name": NAME
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- get_dagster_logger().info(f"URL: {str(url)}")
-
- req = request.Request(url, str.encode(json.dumps(data)))
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- c = r.read()
- d = json.loads(c)
- cid = d['Id']
-
- print(r.status)
- get_dagster_logger().info(f"Create: {str(r.status)}")
-
- # print(cid)
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
- url = URL + 'containers/' + cid + '/archive'
- params = {
- 'path': ARCHIVE_PATH
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
-
- # DATA = read_file_bytestream(ARCHIVE_FILE)
- DATA = s3reader(ARCHIVE_FILE)
-
- req = request.Request(url, data=DATA, method="PUT")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- print(r.status)
- get_dagster_logger().info(f"Archive: {str(r.status)}")
-
- # c = r.read()
- # print(c)
- # d = json.loads(c)
- # print(d)
-
- ## ------------ Start
-
- url = URL + 'containers/' + cid + '/start'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Start: {str(r.status)}")
-
- ## ------------ Wait expect 200
-
- url = URL + 'containers/' + cid + '/wait'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Wait: {str(r.status)}")
-
- ## ------------ Copy logs expect 200
-
- url = URL + 'containers/' + cid + '/logs'
- params = {
- 'stdout': 'true',
- 'stderr': 'false'
- }
- query_string = urllib.parse.urlencode(params)
-
- url = url + "?" + query_string
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- c = r.read()
-
- # write to file
- # f = open(LOGFILE, 'w')
- # f.write(str(c))
- # f.close()
-
- # write to s3
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
-
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"Logs: {str(r.status)}")
-
- ## ------------ Remove expect 204
-
- url = URL + 'containers/' + cid
- req = request.Request(url, method="DELETE")
- req.add_header('X-API-Key', APIKEY)
- # req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Remove: {str(r.status)}")
-
- return 0
-
-@op
-def icrisat_gleaner(context):
- returned_value = gleanerio(("gleaner"), "icrisat")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner notes are {r} ")
- return r
-
-@op
-def icrisat_nabu(context, msg: str):
- returned_value = gleanerio(("nabu"), "icrisat")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def icrisat_nabuprov(context, msg: str):
- returned_value = gleanerio(("prov"), "icrisat")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def icrisat_nabuorg(context, msg: str):
- returned_value = gleanerio(("orgs"), "icrisat")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def icrisat_naburelease(context, msg: str):
- returned_value = gleanerio(("release"), "icrisat")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@graph
-def harvest_icrisat():
- harvest = icrisat_gleaner()
- load1 = icrisat_nabu(harvest)
- load2 = icrisat_nabuprov(load1)
- load3 = icrisat_nabuorg(load2)
- load4 = icrisat_naburelease(load3)
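
Each of the deleted per-source modules repeats the same container lifecycle against the Portainer-proxied Docker API: create the container, upload the config archive, start, wait, fetch stdout, remove. The sketch below factors that sequence into one helper; it is a minimal illustration, not repository code, and assumes the same `PORTAINER_URL`/`PORTAINER_KEY` environment variables used above (`portainer_request` and `run_container` are hypothetical names).

```python
# Minimal sketch, not repository code: the create -> archive -> start ->
# wait -> logs -> remove sequence each generated module repeats, behind one
# helper. Assumes PORTAINER_URL ends with the Docker API prefix and
# PORTAINER_KEY is a valid Portainer API key.
import json
import os
import urllib.parse
from urllib import request

URL = os.environ.get('PORTAINER_URL')
APIKEY = os.environ.get('PORTAINER_KEY')

def portainer_request(path, *, method="GET", body=None, params=None,
                      content_type='application/json'):
    url = URL + path
    if params:
        url += "?" + urllib.parse.urlencode(params)
    req = request.Request(url, data=body, method=method)
    req.add_header('X-API-Key', APIKEY)
    req.add_header('content-type', content_type)
    req.add_header('accept', 'application/json')
    return request.urlopen(req)

def run_container(image, cmd, name, archive_bytes, archive_path, env):
    spec = {"Image": image, "Cmd": cmd, "Env": env}
    r = portainer_request('containers/create', method="POST",
                          body=json.dumps(spec).encode(),
                          params={"name": name})
    cid = json.loads(r.read())['Id']
    # PUT a tar archive of the config into the container before starting it
    portainer_request(f'containers/{cid}/archive', method="PUT",
                      body=archive_bytes, params={'path': archive_path},
                      content_type='application/x-compressed')
    portainer_request(f'containers/{cid}/start', method="POST")
    portainer_request(f'containers/{cid}/wait', method="POST")
    logs = portainer_request(f'containers/{cid}/logs',
                             params={'stdout': 'true', 'stderr': 'false'}).read()
    portainer_request(f'containers/{cid}', method="DELETE")
    return logs
```
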
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_ifdc.py b/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_ifdc.py
deleted file mode 100644
index 9bb6fd95..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_ifdc.py
+++ /dev/null
@@ -1,304 +0,0 @@
-import distutils
-
-from dagster import op, graph, get_dagster_logger
-import subprocess
-import os, json, io
-import urllib
-from urllib import request
-from dagster import job, op, get_dagster_logger
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-
-# Vars and Envs
-
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-MINIO_URL = os.environ.get('GLEANER_MINIO_URL')
-MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT')
-MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL')
-MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET')
-MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY')
-MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET')
-
-
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- get_dagster_logger().info(f"S3 URL : {str(os.environ.get('GLEANER_MINIO_URL'))}")
- get_dagster_logger().info(f"S3 PORT : {str(os.environ.get('GLEANER_MINIO_PORT'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {str(os.environ.get('GLEANER_MINIO_BUCKET'))}")
- get_dagster_logger().info(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
- try:
- data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- client = Minio(
- server,
- # secure=True,
- secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname
- client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'),
- objPrefix,
- io.BytesIO(data),
- len(data))
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-
-
-def gleanerio(mode, source):
- ## ------------ Create
-
- get_dagster_logger().info(f"Create: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH')
- CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"]
- NAME = "gleaner01_" + source
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "nabu"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
- return 1
-
- data = {}
- data["Image"] = IMAGE
- data["Cmd"] = CMD
-
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
- enva = []
- enva.append(str("MINIO_URL={}".format(MINIO_URL)))
- enva.append(str("MINIO_PORT={}".format(MINIO_PORT)))
- enva.append(str("MINIO_SSL={}".format(MINIO_SSL)))
- enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET)))
- enva.append(str("MINIO_KEY={}".format(MINIO_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET)))
-
- data["Env"] = enva
-
- url = URL + 'containers/create'
- params = {
- "name": NAME
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- get_dagster_logger().info(f"URL: {str(url)}")
-
- req = request.Request(url, str.encode(json.dumps(data)))
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- c = r.read()
- d = json.loads(c)
- cid = d['Id']
-
- print(r.status)
- get_dagster_logger().info(f"Create: {str(r.status)}")
-
- # print(cid)
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
- url = URL + 'containers/' + cid + '/archive'
- params = {
- 'path': ARCHIVE_PATH
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
-
- # DATA = read_file_bytestream(ARCHIVE_FILE)
- DATA = s3reader(ARCHIVE_FILE)
-
- req = request.Request(url, data=DATA, method="PUT")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- print(r.status)
- get_dagster_logger().info(f"Archive: {str(r.status)}")
-
- # c = r.read()
- # print(c)
- # d = json.loads(c)
- # print(d)
-
- ## ------------ Start
-
- url = URL + 'containers/' + cid + '/start'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Start: {str(r.status)}")
-
- ## ------------ Wait expect 200
-
- url = URL + 'containers/' + cid + '/wait'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Wait: {str(r.status)}")
-
- ## ------------ Copy logs expect 200
-
- url = URL + 'containers/' + cid + '/logs'
- params = {
- 'stdout': 'true',
- 'stderr': 'false'
- }
- query_string = urllib.parse.urlencode(params)
-
- url = url + "?" + query_string
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- c = r.read()
-
- # write to file
- # f = open(LOGFILE, 'w')
- # f.write(str(c))
- # f.close()
-
- # write to s3
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
-
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"Logs: {str(r.status)}")
-
- ## ------------ Remove expect 204
-
- url = URL + 'containers/' + cid
- req = request.Request(url, method="DELETE")
- req.add_header('X-API-Key', APIKEY)
- # req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Remove: {str(r.status)}")
-
- return 0
-
-@op
-def ifdc_gleaner(context):
- returned_value = gleanerio(("gleaner"), "ifdc")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner notes are {r} ")
- return r
-
-@op
-def ifdc_nabu(context, msg: str):
- returned_value = gleanerio(("nabu"), "ifdc")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def ifdc_nabuprov(context, msg: str):
- returned_value = gleanerio(("prov"), "ifdc")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def ifdc_nabuorg(context, msg: str):
- returned_value = gleanerio(("orgs"), "ifdc")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def ifdc_naburelease(context, msg: str):
- returned_value = gleanerio(("release"), "ifdc")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@graph
-def harvest_ifdc():
- harvest = ifdc_gleaner()
- load1 = ifdc_nabu(harvest)
- load2 = ifdc_nabuprov(load1)
- load3 = ifdc_nabuorg(load2)
- load4 = ifdc_naburelease(load3)
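
The `if/elif` chain mapping a mode to an image, archive, and command is another block the generator stamps into every module. As a sketch, the same dispatch can live in a lookup table; the commands and environment variable names below mirror the deleted code, while `mode_config` itself is only an illustration.

```python
# Sketch only: the mode dispatch from the if/elif chain above, as data.
import os

def mode_config(mode, source):
    nabu_base = {
        "image": os.environ.get('GLEANERIO_NABU_IMAGE'),
        "archive_file": os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT'),
        "archive_path": os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH'),
        "name": "nabu01_" + source,
    }
    table = {
        "gleaner": {
            "image": os.environ.get('GLEANERIO_GLEANER_IMAGE'),
            "archive_file": os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT'),
            "archive_path": os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH'),
            "name": "gleaner01_" + source,
            "cmd": ["--cfg", "/gleaner/gleanerconfig.yaml",
                    "--source", source, "--rude"],
        },
        "nabu": dict(nabu_base, cmd=["--cfg", "/nabu/nabuconfig.yaml",
                                     "prune", "--prefix", "summoned/" + source]),
        "prov": dict(nabu_base, cmd=["--cfg", "/nabu/nabuconfig.yaml",
                                     "prefix", "--prefix", "prov/" + source]),
        "orgs": dict(nabu_base, cmd=["--cfg", "/nabu/nabuconfig.yaml",
                                     "prefix", "--prefix", "orgs"]),
        "release": dict(nabu_base, cmd=["--cfg", "/nabu/nabuconfig.yaml",
                                        "release", "--prefix", "summoned/" + source]),
    }
    return table.get(mode)  # None plays the role of the `return 1` fall-through
```
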
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_ifsttar.py b/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_ifsttar.py
deleted file mode 100644
index 7b75704d..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_ifsttar.py
+++ /dev/null
@@ -1,304 +0,0 @@
-import distutils
-
-from dagster import op, graph, get_dagster_logger
-import subprocess
-import os, json, io
-import urllib
-from urllib import request
-from dagster import job, op, get_dagster_logger
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-
-# Vars and Envs
-
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-MINIO_URL = os.environ.get('GLEANER_MINIO_URL')
-MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT')
-MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL')
-MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET')
-MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY')
-MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET')
-
-
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- get_dagster_logger().info(f"S3 URL : {str(os.environ.get('GLEANER_MINIO_URL'))}")
- get_dagster_logger().info(f"S3 PORT : {str(os.environ.get('GLEANER_MINIO_PORT'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {str(os.environ.get('GLEANER_MINIO_BUCKET'))}")
- get_dagster_logger().info(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
- try:
- data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- client = Minio(
- server,
- # secure=True,
- secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname
- client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'),
- objPrefix,
- io.BytesIO(data),
- len(data))
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-
-
-def gleanerio(mode, source):
- ## ------------ Create
-
- get_dagster_logger().info(f"Create: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH')
- CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"]
- NAME = "gleaner01_" + source
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "nabu"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
- return 1
-
- data = {}
- data["Image"] = IMAGE
- data["Cmd"] = CMD
-
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
- enva = []
- enva.append(str("MINIO_URL={}".format(MINIO_URL)))
- enva.append(str("MINIO_PORT={}".format(MINIO_PORT)))
- enva.append(str("MINIO_SSL={}".format(MINIO_SSL)))
- enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET)))
- enva.append(str("MINIO_KEY={}".format(MINIO_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET)))
-
- data["Env"] = enva
-
- url = URL + 'containers/create'
- params = {
- "name": NAME
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- get_dagster_logger().info(f"URL: {str(url)}")
-
- req = request.Request(url, str.encode(json.dumps(data)))
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- c = r.read()
- d = json.loads(c)
- cid = d['Id']
-
- print(r.status)
- get_dagster_logger().info(f"Create: {str(r.status)}")
-
- # print(cid)
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
- url = URL + 'containers/' + cid + '/archive'
- params = {
- 'path': ARCHIVE_PATH
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
-
- # DATA = read_file_bytestream(ARCHIVE_FILE)
- DATA = s3reader(ARCHIVE_FILE)
-
- req = request.Request(url, data=DATA, method="PUT")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- print(r.status)
- get_dagster_logger().info(f"Archive: {str(r.status)}")
-
- # c = r.read()
- # print(c)
- # d = json.loads(c)
- # print(d)
-
- ## ------------ Start
-
- url = URL + 'containers/' + cid + '/start'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Start: {str(r.status)}")
-
- ## ------------ Wait expect 200
-
- url = URL + 'containers/' + cid + '/wait'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Wait: {str(r.status)}")
-
- ## ------------ Copy logs expect 200
-
- url = URL + 'containers/' + cid + '/logs'
- params = {
- 'stdout': 'true',
- 'stderr': 'false'
- }
- query_string = urllib.parse.urlencode(params)
-
- url = url + "?" + query_string
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- c = r.read()
-
- # write to file
- # f = open(LOGFILE, 'w')
- # f.write(str(c))
- # f.close()
-
- # write to s3
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
-
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"Logs: {str(r.status)}")
-
- ## ------------ Remove expect 204
-
- url = URL + 'containers/' + cid
- req = request.Request(url, method="DELETE")
- req.add_header('X-API-Key', APIKEY)
- # req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Remove: {str(r.status)}")
-
- return 0
-
-@op
-def ifsttar_gleaner(context):
- returned_value = gleanerio(("gleaner"), "ifsttar")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner notes are {r} ")
- return r
-
-@op
-def ifsttar_nabu(context, msg: str):
- returned_value = gleanerio(("nabu"), "ifsttar")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def ifsttar_nabuprov(context, msg: str):
- returned_value = gleanerio(("prov"), "ifsttar")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def ifsttar_nabuorg(context, msg: str):
- returned_value = gleanerio(("orgs"), "ifsttar")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def ifsttar_naburelease(context, msg: str):
- returned_value = gleanerio(("release"), "ifsttar")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@graph
-def harvest_ifsttar():
- harvest = ifsttar_gleaner()
- load1 = ifsttar_nabu(harvest)
- load2 = ifsttar_nabuprov(load1)
- load3 = ifsttar_nabuorg(load2)
- load4 = ifsttar_naburelease(load3)
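
One maintenance note on the shared MinIO helpers: they parse `GLEANER_MINIO_SSL` with `distutils.util.strtobool`, and `distutils` is deprecated since Python 3.10 and removed in 3.12. Below is a sketch of the log uploader with a small local parser in its place; the environment variable names match the deleted code, and `_truthy` is a hypothetical helper, not part of the repository.

```python
# Sketch of the shared log uploader with distutils.util.strtobool replaced
# by a local parser (distutils is removed as of Python 3.12). Same env var
# names as the deleted code; _truthy is a hypothetical helper.
import io
import os
from datetime import datetime
from minio import Minio

def _truthy(value) -> bool:
    return str(value).strip().lower() in ("1", "true", "yes", "on", "y", "t")

def s3loader(data: bytes, name: str) -> None:
    client = Minio(
        os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT'),
        secure=_truthy(os.environ.get('GLEANER_MINIO_SSL')),
        access_key=os.environ.get('GLEANER_MINIO_KEY'),
        secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
    )
    stamp = datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
    obj_name = os.environ.get('GLEANERIO_LOG_PREFIX') + name + f"_{stamp}.log"
    client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'), obj_name,
                      io.BytesIO(data), len(data))
```
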
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_iisg.py b/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_iisg.py
deleted file mode 100644
index e128fbfb..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_iisg.py
+++ /dev/null
@@ -1,304 +0,0 @@
-import distutils
-
-from dagster import op, graph, get_dagster_logger
-import subprocess
-import os, json, io
-import urllib
-from urllib import request
-from dagster import job, op, get_dagster_logger
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-
-# Vars and Envs
-
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-MINIO_URL = os.environ.get('GLEANER_MINIO_URL')
-MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT')
-MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL')
-MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET')
-MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY')
-MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET')
-
-
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- get_dagster_logger().info(f"S3 URL : {str(os.environ.get('GLEANER_MINIO_URL'))}")
- get_dagster_logger().info(f"S3 PORT : {str(os.environ.get('GLEANER_MINIO_PORT'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {str(os.environ.get('GLEANER_MINIO_BUCKET'))}")
- get_dagster_logger().info(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
- try:
- data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- client = Minio(
- server,
- # secure=True,
- secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname
- client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'),
- objPrefix,
- io.BytesIO(data),
- len(data))
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-
-
-def gleanerio(mode, source):
- ## ------------ Create
-
- get_dagster_logger().info(f"Create: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH')
- CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"]
- NAME = "gleaner01_" + source
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "nabu"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
- return 1
-
- data = {}
- data["Image"] = IMAGE
- data["Cmd"] = CMD
-
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
- enva = []
- enva.append(str("MINIO_URL={}".format(MINIO_URL)))
- enva.append(str("MINIO_PORT={}".format(MINIO_PORT)))
- enva.append(str("MINIO_SSL={}".format(MINIO_SSL)))
- enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET)))
- enva.append(str("MINIO_KEY={}".format(MINIO_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET)))
-
- data["Env"] = enva
-
- url = URL + 'containers/create'
- params = {
- "name": NAME
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- get_dagster_logger().info(f"URL: {str(url)}")
-
- req = request.Request(url, str.encode(json.dumps(data)))
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- c = r.read()
- d = json.loads(c)
- cid = d['Id']
-
- print(r.status)
- get_dagster_logger().info(f"Create: {str(r.status)}")
-
- # print(cid)
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
- url = URL + 'containers/' + cid + '/archive'
- params = {
- 'path': ARCHIVE_PATH
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
-
- # DATA = read_file_bytestream(ARCHIVE_FILE)
- DATA = s3reader(ARCHIVE_FILE)
-
- req = request.Request(url, data=DATA, method="PUT")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- print(r.status)
- get_dagster_logger().info(f"Archive: {str(r.status)}")
-
- # c = r.read()
- # print(c)
- # d = json.loads(c)
- # print(d)
-
- ## ------------ Start
-
- url = URL + 'containers/' + cid + '/start'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Start: {str(r.status)}")
-
- ## ------------ Wait expect 200
-
- url = URL + 'containers/' + cid + '/wait'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Wait: {str(r.status)}")
-
- ## ------------ Copy logs expect 200
-
- url = URL + 'containers/' + cid + '/logs'
- params = {
- 'stdout': 'true',
- 'stderr': 'false'
- }
- query_string = urllib.parse.urlencode(params)
-
- url = url + "?" + query_string
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- c = r.read()
-
- # write to file
- # f = open(LOGFILE, 'w')
- # f.write(str(c))
- # f.close()
-
- # write to s3
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
-
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"Logs: {str(r.status)}")
-
- ## ------------ Remove expect 204
-
- url = URL + 'containers/' + cid
- req = request.Request(url, method="DELETE")
- req.add_header('X-API-Key', APIKEY)
- # req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Remove: {str(r.status)}")
-
- return 0
-
-@op
-def iisg_gleaner(context):
- returned_value = gleanerio(("gleaner"), "iisg")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner notes are {r} ")
- return r
-
-@op
-def iisg_nabu(context, msg: str):
- returned_value = gleanerio(("nabu"), "iisg")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def iisg_nabuprov(context, msg: str):
- returned_value = gleanerio(("prov"), "iisg")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def iisg_nabuorg(context, msg: str):
- returned_value = gleanerio(("orgs"), "iisg")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def iisg_naburelease(context, msg: str):
- returned_value = gleanerio(("release"), "iisg")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@graph
-def harvest_iisg():
- harvest = iisg_gleaner()
- load1 = iisg_nabu(harvest)
- load2 = iisg_nabuprov(load1)
- load3 = iisg_nabuorg(load2)
- load4 = iisg_naburelease(load3)
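
Every module also ends with the same five ops and a five-stage graph, differing only in the source name. A factory sketch that builds them for any source is one way to fold the generated files into a single template; it assumes the `gleanerio()` defined in the modules above, and `make_harvest_graph` is an illustrative name, not the repository's actual code generator.

```python
# Sketch of a factory for the per-source op/graph boilerplate. Assumes the
# gleanerio() defined above; not the repository's generator.
from dagster import op, graph, get_dagster_logger

def make_harvest_graph(source: str):
    @op(name=f"{source}_gleaner")
    def gleaner_op(context):
        rv = gleanerio("gleaner", source)
        get_dagster_logger().info(f"Gleaner notes are returned value:{rv}")
        return f"returned value:{rv}"

    def chained(stage: str):
        # nabu/prov/orgs/release all share the same pass-the-message shape
        @op(name=f"{source}_{stage}")
        def _op(context, msg: str):
            return msg + f"returned value:{gleanerio(stage, source)}"
        return _op

    nabu, prov, orgs, release = (chained(s) for s in
                                 ("nabu", "prov", "orgs", "release"))

    @graph(name=f"harvest_{source}")
    def harvest():
        release(orgs(prov(nabu(gleaner_op()))))

    return harvest
```
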
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_iit.py b/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_iit.py
deleted file mode 100644
index 9939a812..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_iit.py
+++ /dev/null
@@ -1,304 +0,0 @@
-import distutils
-
-from dagster import op, graph, get_dagster_logger
-import subprocess
-import os, json, io
-import urllib
-from urllib import request
-from dagster import job, op, get_dagster_logger
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-
-# Vars and Envs
-
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-MINIO_URL = os.environ.get('GLEANER_MINIO_URL')
-MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT')
-MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL')
-MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET')
-MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY')
-MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET')
-
-
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- get_dagster_logger().info(f"S3 URL : {str(os.environ.get('GLEANER_MINIO_URL'))}")
- get_dagster_logger().info(f"S3 PORT : {str(os.environ.get('GLEANER_MINIO_PORT'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {str(os.environ.get('GLEANER_MINIO_BUCKET'))}")
- get_dagster_logger().info(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
- try:
- data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- client = Minio(
- server,
- # secure=True,
- secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname
- client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'),
- objPrefix,
- io.BytesIO(data),
- len(data))
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-
-
-def gleanerio(mode, source):
- ## ------------ Create
-
- get_dagster_logger().info(f"Create: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH')
- CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"]
- NAME = "gleaner01_" + source
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "nabu"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
- return 1
-
- data = {}
- data["Image"] = IMAGE
- data["Cmd"] = CMD
-
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
- enva = []
- enva.append(str("MINIO_URL={}".format(MINIO_URL)))
- enva.append(str("MINIO_PORT={}".format(MINIO_PORT)))
- enva.append(str("MINIO_SSL={}".format(MINIO_SSL)))
- enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET)))
- enva.append(str("MINIO_KEY={}".format(MINIO_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET)))
-
- data["Env"] = enva
-
- url = URL + 'containers/create'
- params = {
- "name": NAME
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- get_dagster_logger().info(f"URL: {str(url)}")
-
- req = request.Request(url, str.encode(json.dumps(data)))
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- c = r.read()
- d = json.loads(c)
- cid = d['Id']
-
- print(r.status)
- get_dagster_logger().info(f"Create: {str(r.status)}")
-
- # print(cid)
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
- url = URL + 'containers/' + cid + '/archive'
- params = {
- 'path': ARCHIVE_PATH
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
-
- # DATA = read_file_bytestream(ARCHIVE_FILE)
- DATA = s3reader(ARCHIVE_FILE)
-
- req = request.Request(url, data=DATA, method="PUT")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- print(r.status)
- get_dagster_logger().info(f"Archive: {str(r.status)}")
-
- # c = r.read()
- # print(c)
- # d = json.loads(c)
- # print(d)
-
- ## ------------ Start
-
- url = URL + 'containers/' + cid + '/start'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Start: {str(r.status)}")
-
- ## ------------ Wait expect 200
-
- url = URL + 'containers/' + cid + '/wait'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Wait: {str(r.status)}")
-
- ## ------------ Copy logs expect 200
-
- url = URL + 'containers/' + cid + '/logs'
- params = {
- 'stdout': 'true',
- 'stderr': 'false'
- }
- query_string = urllib.parse.urlencode(params)
-
- url = url + "?" + query_string
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- c = r.read()
-
- # write to file
- # f = open(LOGFILE, 'w')
- # f.write(str(c))
- # f.close()
-
- # write to s3
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
-
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"Logs: {str(r.status)}")
-
- ## ------------ Remove expect 204
-
- url = URL + 'containers/' + cid
- req = request.Request(url, method="DELETE")
- req.add_header('X-API-Key', APIKEY)
- # req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Remove: {str(r.status)}")
-
- return 0
-
-@op
-def iit_gleaner(context):
- returned_value = gleanerio(("gleaner"), "iit")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner notes are {r} ")
- return r
-
-@op
-def iit_nabu(context, msg: str):
- returned_value = gleanerio(("nabu"), "iit")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def iit_nabuprov(context, msg: str):
- returned_value = gleanerio(("prov"), "iit")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def iit_nabuorg(context, msg: str):
- returned_value = gleanerio(("orgs"), "iit")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def iit_naburelease(context, msg: str):
- returned_value = gleanerio(("release"), "iit")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@graph
-def harvest_iit():
- harvest = iit_gleaner()
- load1 = iit_nabu(harvest)
- load2 = iit_nabuprov(load1)
- load3 = iit_nabuorg(load2)
- load4 = iit_naburelease(load3)
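
The modules log the HTTP status of each call but never inspect the container's own exit code, so a failed harvest still logs `Wait: 200`. The Docker `wait` endpoint returns a JSON body with a `StatusCode` field, which lets a nonzero exit be raised as a failure. A hedged sketch (`wait_for_exit` is an illustrative name):

```python
# Sketch: surface a failed run by checking the StatusCode the Docker wait
# endpoint returns, instead of only logging the HTTP status.
import json
from urllib import request

def wait_for_exit(url: str, cid: str, apikey: str) -> int:
    req = request.Request(url + f'containers/{cid}/wait', method="POST")
    req.add_header('X-API-Key', apikey)
    req.add_header('accept', 'application/json')
    with request.urlopen(req) as r:
        body = json.loads(r.read())      # e.g. {"StatusCode": 0}
    code = body.get("StatusCode", -1)
    if code != 0:
        raise RuntimeError(f"container {cid} exited with status {code}")
    return code
```
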
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_ipc.py b/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_ipc.py
deleted file mode 100644
index c7558d0c..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_ipc.py
+++ /dev/null
@@ -1,304 +0,0 @@
-import distutils
-
-from dagster import op, graph, get_dagster_logger
-import subprocess
-import os, json, io
-import urllib
-from urllib import request
-from dagster import job, op, get_dagster_logger
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-
-# Vars and Envs
-
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-MINIO_URL = os.environ.get('GLEANER_MINIO_URL')
-MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT')
-MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL')
-MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET')
-MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY')
-MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET')
-
-
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- get_dagster_logger().info(f"S3 URL : {str(os.environ.get('GLEANER_MINIO_URL'))}")
- get_dagster_logger().info(f"S3 PORT : {str(os.environ.get('GLEANER_MINIO_PORT'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {str(os.environ.get('GLEANER_MINIO_BUCKET'))}")
- get_dagster_logger().info(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
- try:
- data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- client = Minio(
- server,
- # secure=True,
- secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname
- client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'),
- objPrefix,
- io.BytesIO(data),
- len(data))
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-
-
-def gleanerio(mode, source):
- ## ------------ Create
-
- get_dagster_logger().info(f"Create: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH')
- CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"]
- NAME = "gleaner01_" + source
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "nabu"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
- return 1
-
- data = {}
- data["Image"] = IMAGE
- data["Cmd"] = CMD
-
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
- enva = []
- enva.append(str("MINIO_URL={}".format(MINIO_URL)))
- enva.append(str("MINIO_PORT={}".format(MINIO_PORT)))
- enva.append(str("MINIO_SSL={}".format(MINIO_SSL)))
- enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET)))
- enva.append(str("MINIO_KEY={}".format(MINIO_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET)))
-
- data["Env"] = enva
-
- url = URL + 'containers/create'
- params = {
- "name": NAME
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- get_dagster_logger().info(f"URL: {str(url)}")
-
- req = request.Request(url, str.encode(json.dumps(data)))
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- c = r.read()
- d = json.loads(c)
- cid = d['Id']
-
- print(r.status)
- get_dagster_logger().info(f"Create: {str(r.status)}")
-
- # print(cid)
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
- url = URL + 'containers/' + cid + '/archive'
- params = {
- 'path': ARCHIVE_PATH
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
-
- # DATA = read_file_bytestream(ARCHIVE_FILE)
- DATA = s3reader(ARCHIVE_FILE)
-
- req = request.Request(url, data=DATA, method="PUT")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- print(r.status)
- get_dagster_logger().info(f"Archive: {str(r.status)}")
-
- # c = r.read()
- # print(c)
- # d = json.loads(c)
- # print(d)
-
- ## ------------ Start
-
- url = URL + 'containers/' + cid + '/start'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Start: {str(r.status)}")
-
- ## ------------ Wait expect 200
-
- url = URL + 'containers/' + cid + '/wait'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Wait: {str(r.status)}")
-
- ## ------------ Copy logs expect 200
-
- url = URL + 'containers/' + cid + '/logs'
- params = {
- 'stdout': 'true',
- 'stderr': 'false'
- }
- query_string = urllib.parse.urlencode(params)
-
- url = url + "?" + query_string
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- c = r.read()
-
- # write to file
- # f = open(LOGFILE, 'w')
- # f.write(str(c))
- # f.close()
-
- # write to s3
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
-
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"Logs: {str(r.status)}")
-
- ## ------------ Remove expect 204
-
- url = URL + 'containers/' + cid
- req = request.Request(url, method="DELETE")
- req.add_header('X-API-Key', APIKEY)
- # req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Remove: {str(r.status)}")
-
- return 0
-
-@op
-def ipc_gleaner(context):
- returned_value = gleanerio(("gleaner"), "ipc")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner notes are {r} ")
- return r
-
-@op
-def ipc_nabu(context, msg: str):
- returned_value = gleanerio(("nabu"), "ipc")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def ipc_nabuprov(context, msg: str):
- returned_value = gleanerio(("prov"), "ipc")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def ipc_nabuorg(context, msg: str):
- returned_value = gleanerio(("orgs"), "ipc")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def ipc_naburelease(context, msg: str):
- returned_value = gleanerio(("release"), "ipc")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@graph
-def harvest_ipc():
- harvest = ipc_gleaner()
- load1 = ipc_nabu(harvest)
- load2 = ipc_nabuprov(load1)
- load3 = ipc_nabuorg(load2)
- load4 = ipc_naburelease(load3)
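
A subtle bug worth noting in the log step of all these modules: `r.read()` already returns bytes, so `s3loader(str(c).encode(), NAME)` uploads the Python repr of the bytes (the stored object literally starts with `b'`), not the raw log text. A tiny self-contained demonstration:

```python
# c = r.read() is already bytes; str(c) produces its repr, not the text.
c = b"container log line\n"        # what r.read() returns
wrong = str(c).encode()            # b"b'container log line\\n'" -- the repr
right = c                          # pass the bytes straight to s3loader
assert wrong.startswith(b"b'") and right == b"container log line\n"
```
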
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_irl.py b/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_irl.py
deleted file mode 100644
index 53b10ae5..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_irl.py
+++ /dev/null
@@ -1,304 +0,0 @@
-import distutils
-
-from dagster import op, graph, get_dagster_logger
-import subprocess
-import os, json, io
-import urllib
-from urllib import request
-from dagster import job, op, get_dagster_logger
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-
-# Vars and Envs
-
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-MINIO_URL = os.environ.get('GLEANER_MINIO_URL')
-MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT')
-MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL')
-MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET')
-MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY')
-MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET')
-
-
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- get_dagster_logger().info(f"S3 URL : {str(os.environ.get('GLEANER_MINIO_URL'))}")
- get_dagster_logger().info(f"S3 PORT : {str(os.environ.get('GLEANER_MINIO_PORT'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {str(os.environ.get('GLEANER_MINIO_BUCKET'))}")
- get_dagster_logger().info(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
- try:
- data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- client = Minio(
- server,
- # secure=True,
- secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname
- client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'),
- objPrefix,
- io.BytesIO(data),
- len(data))
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-
-
-def gleanerio(mode, source):
- ## ------------ Create
-
- get_dagster_logger().info(f"Create: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH')
- CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"]
- NAME = "gleaner01_" + source
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "nabu"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
- return 1
-
- data = {}
- data["Image"] = IMAGE
- data["Cmd"] = CMD
-
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
- enva = []
- enva.append(str("MINIO_URL={}".format(MINIO_URL)))
- enva.append(str("MINIO_PORT={}".format(MINIO_PORT)))
- enva.append(str("MINIO_SSL={}".format(MINIO_SSL)))
- enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET)))
- enva.append(str("MINIO_KEY={}".format(MINIO_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET)))
-
- data["Env"] = enva
-
- url = URL + 'containers/create'
- params = {
- "name": NAME
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- get_dagster_logger().info(f"URL: {str(url)}")
-
- req = request.Request(url, str.encode(json.dumps(data)))
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- c = r.read()
- d = json.loads(c)
- cid = d['Id']
-
- print(r.status)
- get_dagster_logger().info(f"Create: {str(r.status)}")
-
- # print(cid)
-
- ## ------------ Archive upload: send the config (read from MinIO via s3reader) into the container
-
- url = URL + 'containers/' + cid + '/archive'
- params = {
- 'path': ARCHIVE_PATH
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
-
- # DATA = read_file_bytestream(ARCHIVE_FILE)
- DATA = s3reader(ARCHIVE_FILE)
-
- req = request.Request(url, data=DATA, method="PUT")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- print(r.status)
- get_dagster_logger().info(f"Archive: {str(r.status)}")
-
- # c = r.read()
- # print(c)
- # d = json.loads(c)
- # print(d)
-
- ## ------------ Start
-
- url = URL + 'containers/' + cid + '/start'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Start: {str(r.status)}")
-
- ## ------------ Wait expect 200
-
- url = URL + 'containers/' + cid + '/wait'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Wait: {str(r.status)}")
-
- ## ------------ Copy logs expect 200
-
- url = URL + 'containers/' + cid + '/logs'
- params = {
- 'stdout': 'true',
- 'stderr': 'false'
- }
- query_string = urllib.parse.urlencode(params)
-
- url = url + "?" + query_string
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- c = r.read()
-
- # write to file
- # f = open(LOGFILE, 'w')
- # f.write(str(c))
- # f.close()
-
- # write to s3
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes-like object
-
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"Logs: {str(r.status)}")
-
- ## ------------ Remove expect 204
-
- url = URL + 'containers/' + cid
- req = request.Request(url, method="DELETE")
- req.add_header('X-API-Key', APIKEY)
- # req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Remove: {str(r.status)}")
-
- return 0
-
-@op
-def irl_gleaner(context):
- returned_value = gleanerio(("gleaner"), "irl")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner notes are {r} ")
- return r
-
-@op
-def irl_nabu(context, msg: str):
- returned_value = gleanerio(("nabu"), "irl")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def irl_nabuprov(context, msg: str):
- returned_value = gleanerio(("prov"), "irl")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def irl_nabuorg(context, msg: str):
- returned_value = gleanerio(("orgs"), "irl")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def irl_naburelease(context, msg: str):
- returned_value = gleanerio(("release"), "irl")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@graph
-def harvest_irl():
- harvest = irl_gleaner()
- load1 = irl_nabu(harvest)
- load2 = irl_nabuprov(load1)
- load3 = irl_nabuorg(load2)
- load4 = irl_naburelease(load3)
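
The deleted modules in this batch all wrap the same pattern: `gleanerio()` drives one full container lifecycle through the Portainer-proxied Docker Engine API (create, upload the config archive, start, wait for exit, capture stdout to MinIO, remove). A minimal sketch of that lifecycle follows, assuming the same `PORTAINER_URL`/`PORTAINER_KEY` conventions; the `run_container` helper and its signature are illustrative, not part of the deleted code:

```python
# Sketch only: condenses the create/start/wait/logs/remove sequence from
# gleanerio(); run_container and its parameters are illustrative.
import json
import urllib.parse
from urllib import request

def run_container(url_base, apikey, image, cmd, name):
    def call(path, method="POST", body=None, headers=None):
        req = request.Request(url_base + path, data=body, method=method)
        req.add_header("X-API-Key", apikey)
        req.add_header("accept", "application/json")
        for k, v in (headers or {}).items():
            req.add_header(k, v)
        return request.urlopen(req)

    # Create the container, keeping its Id for the rest of the lifecycle
    qs = urllib.parse.urlencode({"name": name})
    resp = call("containers/create?" + qs,
                body=json.dumps({"Image": image, "Cmd": cmd}).encode(),
                headers={"content-type": "application/json"})
    cid = json.loads(resp.read())["Id"]

    call("containers/" + cid + "/start")   # expect 204
    call("containers/" + cid + "/wait")    # blocks until the container exits
    logs = call("containers/" + cid + "/logs?stdout=true&stderr=false",
                method="GET").read()
    call("containers/" + cid, method="DELETE")  # expect 204
    return logs
```
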
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_irs.py b/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_irs.py
deleted file mode 100644
index 8449f403..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_irs.py
+++ /dev/null
@@ -1,304 +0,0 @@
-import distutils.util
-
-from dagster import job, op, graph, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-
-# Vars and Envs
-
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-MINIO_URL = os.environ.get('GLEANER_MINIO_URL')
-MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT')
-MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL')
-MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET')
-MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY')
-MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET')
-
-
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- get_dagster_logger().info(f"S3 URL : {str(os.environ.get('GLEANER_MINIO_URL'))}")
- get_dagster_logger().info(f"S3 PORT : {str(os.environ.get('GLEANER_MINIO_PORT'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {str(os.environ.get('GLEANER_MINIO_BUCKET'))}")
- get_dagster_logger().info(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
- try:
- data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- client = Minio(
- server,
- # secure=True,
- secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
-
- # Make bucket 'X' if it does not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname
- client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'),
- objPrefix,
- io.BytesIO(data),
- len(data))
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-
-
-def gleanerio(mode, source):
- ## ------------ Create
-
- get_dagster_logger().info(f"Create: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH')
- CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"]
- NAME = "gleaner01_" + source
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "nabu"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
- return 1
-
- data = {}
- data["Image"] = IMAGE
- data["Cmd"] = CMD
-
- # Add env variables here, e.g. "Env": ["FOO=bar", "BAZ=quux"]
- enva = []
- enva.append(str("MINIO_URL={}".format(MINIO_URL)))
- enva.append(str("MINIO_PORT={}".format(MINIO_PORT)))
- enva.append(str("MINIO_SSL={}".format(MINIO_SSL)))
- enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET)))
- enva.append(str("MINIO_KEY={}".format(MINIO_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET)))
-
- data["Env"] = enva
-
- url = URL + 'containers/create'
- params = {
- "name": NAME
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- get_dagster_logger().info(f"URL: {str(url)}")
-
- req = request.Request(url, str.encode(json.dumps(data)))
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- c = r.read()
- d = json.loads(c)
- cid = d['Id']
-
- print(r.status)
- get_dagster_logger().info(f"Create: {str(r.status)}")
-
- # print(cid)
-
- ## ------------ Archive upload: send the config (read from MinIO via s3reader) into the container
-
- url = URL + 'containers/' + cid + '/archive'
- params = {
- 'path': ARCHIVE_PATH
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
-
- # DATA = read_file_bytestream(ARCHIVE_FILE)
- DATA = s3reader(ARCHIVE_FILE)
-
- req = request.Request(url, data=DATA, method="PUT")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- print(r.status)
- get_dagster_logger().info(f"Archive: {str(r.status)}")
-
- # c = r.read()
- # print(c)
- # d = json.loads(c)
- # print(d)
-
- ## ------------ Start
-
- url = URL + 'containers/' + cid + '/start'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Start: {str(r.status)}")
-
- ## ------------ Wait expect 200
-
- url = URL + 'containers/' + cid + '/wait'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Wait: {str(r.status)}")
-
- ## ------------ Copy logs expect 200
-
- url = URL + 'containers/' + cid + '/logs'
- params = {
- 'stdout': 'true',
- 'stderr': 'false'
- }
- query_string = urllib.parse.urlencode(params)
-
- url = url + "?" + query_string
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- c = r.read()
-
- # write to file
- # f = open(LOGFILE, 'w')
- # f.write(str(c))
- # f.close()
-
- # write to s3
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes-like object
-
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"Logs: {str(r.status)}")
-
- ## ------------ Remove expect 204
-
- url = URL + 'containers/' + cid
- req = request.Request(url, method="DELETE")
- req.add_header('X-API-Key', APIKEY)
- # req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Remove: {str(r.status)}")
-
- return 0
-
-@op
-def irs_gleaner(context):
- returned_value = gleanerio(("gleaner"), "irs")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner notes are {r} ")
- return r
-
-@op
-def irs_nabu(context, msg: str):
- returned_value = gleanerio(("nabu"), "irs")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def irs_nabuprov(context, msg: str):
- returned_value = gleanerio(("prov"), "irs")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def irs_nabuorg(context, msg: str):
- returned_value = gleanerio(("orgs"), "irs")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def irs_naburelease(context, msg: str):
- returned_value = gleanerio(("release"), "irs")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@graph
-def harvest_irs():
- harvest = irs_gleaner()
- load1 = irs_nabu(harvest)
- load2 = irs_nabuprov(load1)
- load3 = irs_nabuorg(load2)
- load4 = irs_naburelease(load3)
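
Each generated module then declares the same five-op chain, with only the source name baked into the `@op` names. If the templating were ever replaced by a factory, the chain could be built once per source. A sketch under that assumption; `make_harvest_graph` and `make_stage` are hypothetical, not part of the current generator:

```python
# Hypothetical factory for the repeated gleaner -> nabu -> prov -> orgs ->
# release chain; a possible refactor, not the generated code itself.
from dagster import graph, op, get_dagster_logger

def make_harvest_graph(source, gleanerio):
    @op(name=f"{source}_gleaner")
    def gleaner_op(context):
        rv = gleanerio("gleaner", source)
        get_dagster_logger().info(f"Gleaner notes are returned value:{rv}")
        return f"returned value:{rv}"

    def make_stage(mode):
        @op(name=f"{source}_{mode}")
        def stage_op(context, msg: str):
            return msg + f"returned value:{gleanerio(mode, source)}"
        return stage_op

    stage_ops = [make_stage(m) for m in ("nabu", "prov", "orgs", "release")]

    @graph(name=f"harvest_{source}")
    def harvest():
        msg = gleaner_op()
        for stage in stage_ops:
            msg = stage(msg)

    return harvest
```
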
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_julich.py b/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_julich.py
deleted file mode 100644
index a7d3cddc..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_julich.py
+++ /dev/null
@@ -1,304 +0,0 @@
-import distutils.util
-
-from dagster import job, op, graph, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-
-# Vars and Envs
-
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-MINIO_URL = os.environ.get('GLEANER_MINIO_URL')
-MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT')
-MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL')
-MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET')
-MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY')
-MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET')
-
-
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- get_dagster_logger().info(f"S3 URL : {str(os.environ.get('GLEANER_MINIO_URL'))}")
- get_dagster_logger().info(f"S3 PORT : {str(os.environ.get('GLEANER_MINIO_PORT'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {str(os.environ.get('GLEANER_MINIO_BUCKET'))}")
- get_dagster_logger().info(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
- try:
- data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- client = Minio(
- server,
- # secure=True,
- secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
-
- # Make bucket 'X' if it does not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname
- client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'),
- objPrefix,
- io.BytesIO(data),
- len(data))
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-
-
-def gleanerio(mode, source):
- ## ------------ Create
-
- get_dagster_logger().info(f"Create: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH')
- CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"]
- NAME = "gleaner01_" + source
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "nabu"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
- return 1
-
- data = {}
- data["Image"] = IMAGE
- data["Cmd"] = CMD
-
- # Add env variables here, e.g. "Env": ["FOO=bar", "BAZ=quux"]
- enva = []
- enva.append(str("MINIO_URL={}".format(MINIO_URL)))
- enva.append(str("MINIO_PORT={}".format(MINIO_PORT)))
- enva.append(str("MINIO_SSL={}".format(MINIO_SSL)))
- enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET)))
- enva.append(str("MINIO_KEY={}".format(MINIO_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET)))
-
- data["Env"] = enva
-
- url = URL + 'containers/create'
- params = {
- "name": NAME
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- get_dagster_logger().info(f"URL: {str(url)}")
-
- req = request.Request(url, str.encode(json.dumps(data)))
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- c = r.read()
- d = json.loads(c)
- cid = d['Id']
-
- print(r.status)
- get_dagster_logger().info(f"Create: {str(r.status)}")
-
- # print(cid)
-
- ## ------------ Archive upload: send the config (read from MinIO via s3reader) into the container
-
- url = URL + 'containers/' + cid + '/archive'
- params = {
- 'path': ARCHIVE_PATH
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
-
- # DATA = read_file_bytestream(ARCHIVE_FILE)
- DATA = s3reader(ARCHIVE_FILE)
-
- req = request.Request(url, data=DATA, method="PUT")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- print(r.status)
- get_dagster_logger().info(f"Archive: {str(r.status)}")
-
- # c = r.read()
- # print(c)
- # d = json.loads(c)
- # print(d)
-
- ## ------------ Start
-
- url = URL + 'containers/' + cid + '/start'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Start: {str(r.status)}")
-
- ## ------------ Wait expect 200
-
- url = URL + 'containers/' + cid + '/wait'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Wait: {str(r.status)}")
-
- ## ------------ Copy logs expect 200
-
- url = URL + 'containers/' + cid + '/logs'
- params = {
- 'stdout': 'true',
- 'stderr': 'false'
- }
- query_string = urllib.parse.urlencode(params)
-
- url = url + "?" + query_string
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- c = r.read()
-
- # write to file
- # f = open(LOGFILE, 'w')
- # f.write(str(c))
- # f.close()
-
- # write to s3
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes-like object
-
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"Logs: {str(r.status)}")
-
- ## ------------ Remove expect 204
-
- url = URL + 'containers/' + cid
- req = request.Request(url, method="DELETE")
- req.add_header('X-API-Key', APIKEY)
- # req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Remove: {str(r.status)}")
-
- return 0
-
-@op
-def julich_gleaner(context):
- returned_value = gleanerio(("gleaner"), "julich")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner notes are {r} ")
- return r
-
-@op
-def julich_nabu(context, msg: str):
- returned_value = gleanerio(("nabu"), "julich")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def julich_nabuprov(context, msg: str):
- returned_value = gleanerio(("prov"), "julich")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def julich_nabuorg(context, msg: str):
- returned_value = gleanerio(("orgs"), "julich")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def julich_naburelease(context, msg: str):
- returned_value = gleanerio(("release"), "julich")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@graph
-def harvest_julich():
- harvest = julich_gleaner()
- load1 = julich_nabu(harvest)
- load2 = julich_nabuprov(load1)
- load3 = julich_nabuorg(load2)
- load4 = julich_naburelease(load3)
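
A recurring fragility here is `distutils.util.strtobool` for parsing `GLEANER_MINIO_SSL`: `distutils` is deprecated and removed in Python 3.12. A small local helper is the usual replacement; a sketch, keeping the env-var names used above (`env_bool` is illustrative):

```python
# Sketch of a distutils-free replacement for the secure-flag parsing;
# env_bool is illustrative, the env var names match the code above.
import os
from minio import Minio

def env_bool(name, default="false"):
    return os.environ.get(name, default).strip().lower() in ("1", "true", "yes", "on")

def minio_client():
    server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
    return Minio(
        server,
        secure=env_bool('GLEANER_MINIO_SSL'),
        access_key=os.environ.get('GLEANER_MINIO_KEY'),
        secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
    )
```
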
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_lida.py b/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_lida.py
deleted file mode 100644
index d37854c8..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_lida.py
+++ /dev/null
@@ -1,304 +0,0 @@
-import distutils.util
-
-from dagster import job, op, graph, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-
-# Vars and Envs
-
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-MINIO_URL = os.environ.get('GLEANER_MINIO_URL')
-MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT')
-MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL')
-MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET')
-MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY')
-MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET')
-
-
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- get_dagster_logger().info(f"S3 URL : {str(os.environ.get('GLEANER_MINIO_URL'))}")
- get_dagster_logger().info(f"S3 PORT : {str(os.environ.get('GLEANER_MINIO_PORT'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {str(os.environ.get('GLEANER_MINIO_BUCKET'))}")
- get_dagster_logger().info(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
- try:
- data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- client = Minio(
- server,
- # secure=True,
- secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
-
- # Make bucket 'X' if it does not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname
- client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'),
- objPrefix,
- io.BytesIO(data),
- len(data))
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-
-
-def gleanerio(mode, source):
- ## ------------ Create
-
- get_dagster_logger().info(f"Create: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH')
- CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"]
- NAME = "gleaner01_" + source
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "nabu"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
- return 1
-
- data = {}
- data["Image"] = IMAGE
- data["Cmd"] = CMD
-
- # Add env variables here, e.g. "Env": ["FOO=bar", "BAZ=quux"]
- enva = []
- enva.append(str("MINIO_URL={}".format(MINIO_URL)))
- enva.append(str("MINIO_PORT={}".format(MINIO_PORT)))
- enva.append(str("MINIO_SSL={}".format(MINIO_SSL)))
- enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET)))
- enva.append(str("MINIO_KEY={}".format(MINIO_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET)))
-
- data["Env"] = enva
-
- url = URL + 'containers/create'
- params = {
- "name": NAME
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- get_dagster_logger().info(f"URL: {str(url)}")
-
- req = request.Request(url, str.encode(json.dumps(data)))
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- c = r.read()
- d = json.loads(c)
- cid = d['Id']
-
- print(r.status)
- get_dagster_logger().info(f"Create: {str(r.status)}")
-
- # print(cid)
-
- ## ------------ Archive upload: send the config (read from MinIO via s3reader) into the container
-
- url = URL + 'containers/' + cid + '/archive'
- params = {
- 'path': ARCHIVE_PATH
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
-
- # DATA = read_file_bytestream(ARCHIVE_FILE)
- DATA = s3reader(ARCHIVE_FILE)
-
- req = request.Request(url, data=DATA, method="PUT")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- print(r.status)
- get_dagster_logger().info(f"Archive: {str(r.status)}")
-
- # c = r.read()
- # print(c)
- # d = json.loads(c)
- # print(d)
-
- ## ------------ Start
-
- url = URL + 'containers/' + cid + '/start'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Start: {str(r.status)}")
-
- ## ------------ Wait expect 200
-
- url = URL + 'containers/' + cid + '/wait'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Wait: {str(r.status)}")
-
- ## ------------ Copy logs expect 200
-
- url = URL + 'containers/' + cid + '/logs'
- params = {
- 'stdout': 'true',
- 'stderr': 'false'
- }
- query_string = urllib.parse.urlencode(params)
-
- url = url + "?" + query_string
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- c = r.read()
-
- # write to file
- # f = open(LOGFILE, 'w')
- # f.write(str(c))
- # f.close()
-
- # write to s3
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes-like object
-
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"Logs: {str(r.status)}")
-
- ## ------------ Remove expect 204
-
- url = URL + 'containers/' + cid
- req = request.Request(url, method="DELETE")
- req.add_header('X-API-Key', APIKEY)
- # req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Remove: {str(r.status)}")
-
- return 0
-
-@op
-def lida_gleaner(context):
- returned_value = gleanerio(("gleaner"), "lida")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner notes are {r} ")
- return r
-
-@op
-def lida_nabu(context, msg: str):
- returned_value = gleanerio(("nabu"), "lida")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def lida_nabuprov(context, msg: str):
- returned_value = gleanerio(("prov"), "lida")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def lida_nabuorg(context, msg: str):
- returned_value = gleanerio(("orgs"), "lida")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def lida_naburelease(context, msg: str):
- returned_value = gleanerio(("release"), "lida")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@graph
-def harvest_lida():
- harvest = lida_gleaner()
- load1 = lida_nabu(harvest)
- load2 = lida_nabuprov(load1)
- load3 = lida_nabuorg(load2)
- load4 = lida_naburelease(load3)
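
`s3loader` wraps the raw log bytes in `io.BytesIO` because MinIO's `put_object` takes a readable stream plus an explicit length rather than a plain bytes value. Reduced to its core, the upload step looks like this (`upload_log` is illustrative):

```python
# Sketch of the log-upload step inside s3loader(); upload_log is illustrative.
import io
from datetime import datetime

def upload_log(client, bucket, prefix, name, data: bytes):
    stamp = datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
    key = prefix + name + "_" + stamp + ".log"
    # put_object wants a stream and its length, hence the BytesIO wrapper
    client.put_object(bucket, key, io.BytesIO(data), len(data))
    return key
```
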
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_manitoba.py b/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_manitoba.py
deleted file mode 100644
index ec4077d7..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_manitoba.py
+++ /dev/null
@@ -1,304 +0,0 @@
-import distutils.util
-
-from dagster import job, op, graph, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-
-# Vars and Envs
-
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-MINIO_URL = os.environ.get('GLEANER_MINIO_URL')
-MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT')
-MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL')
-MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET')
-MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY')
-MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET')
-
-
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- get_dagster_logger().info(f"S3 URL : {str(os.environ.get('GLEANER_MINIO_URL'))}")
- get_dagster_logger().info(f"S3 PORT : {str(os.environ.get('GLEANER_MINIO_PORT'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {str(os.environ.get('GLEANER_MINIO_BUCKET'))}")
- get_dagster_logger().info(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
- try:
- data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- client = Minio(
- server,
- # secure=True,
- secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
-
- # Make bucket 'X' if it does not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname
- client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'),
- objPrefix,
- io.BytesIO(data),
- len(data))
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-
-
-def gleanerio(mode, source):
- ## ------------ Create
-
- get_dagster_logger().info(f"Create: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH')
- CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"]
- NAME = "gleaner01_" + source
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "nabu"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
- return 1
-
- data = {}
- data["Image"] = IMAGE
- data["Cmd"] = CMD
-
- # Add env variables here, e.g. "Env": ["FOO=bar", "BAZ=quux"]
- enva = []
- enva.append(str("MINIO_URL={}".format(MINIO_URL)))
- enva.append(str("MINIO_PORT={}".format(MINIO_PORT)))
- enva.append(str("MINIO_SSL={}".format(MINIO_SSL)))
- enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET)))
- enva.append(str("MINIO_KEY={}".format(MINIO_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET)))
-
- data["Env"] = enva
-
- url = URL + 'containers/create'
- params = {
- "name": NAME
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- get_dagster_logger().info(f"URL: {str(url)}")
-
- req = request.Request(url, str.encode(json.dumps(data)))
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- c = r.read()
- d = json.loads(c)
- cid = d['Id']
-
- print(r.status)
- get_dagster_logger().info(f"Create: {str(r.status)}")
-
- # print(cid)
-
- ## ------------ Archive upload: send the config (read from MinIO via s3reader) into the container
-
- url = URL + 'containers/' + cid + '/archive'
- params = {
- 'path': ARCHIVE_PATH
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
-
- # DATA = read_file_bytestream(ARCHIVE_FILE)
- DATA = s3reader(ARCHIVE_FILE)
-
- req = request.Request(url, data=DATA, method="PUT")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- print(r.status)
- get_dagster_logger().info(f"Archive: {str(r.status)}")
-
- # c = r.read()
- # print(c)
- # d = json.loads(c)
- # print(d)
-
- ## ------------ Start
-
- url = URL + 'containers/' + cid + '/start'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Start: {str(r.status)}")
-
- ## ------------ Wait expect 200
-
- url = URL + 'containers/' + cid + '/wait'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Wait: {str(r.status)}")
-
- ## ------------ Copy logs expect 200
-
- url = URL + 'containers/' + cid + '/logs'
- params = {
- 'stdout': 'true',
- 'stderr': 'false'
- }
- query_string = urllib.parse.urlencode(params)
-
- url = url + "?" + query_string
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- c = r.read()
-
- # write to file
- # f = open(LOGFILE, 'w')
- # f.write(str(c))
- # f.close()
-
- # write to s3
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes-like object
-
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"Logs: {str(r.status)}")
-
- ## ------------ Remove expect 204
-
- url = URL + 'containers/' + cid
- req = request.Request(url, method="DELETE")
- req.add_header('X-API-Key', APIKEY)
- # req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Remove: {str(r.status)}")
-
- return 0
-
-@op
-def manitoba_gleaner(context):
- returned_value = gleanerio(("gleaner"), "manitoba")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner notes are {r} ")
- return r
-
-@op
-def manitoba_nabu(context, msg: str):
- returned_value = gleanerio(("nabu"), "manitoba")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def manitoba_nabuprov(context, msg: str):
- returned_value = gleanerio(("prov"), "manitoba")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def manitoba_nabuorg(context, msg: str):
- returned_value = gleanerio(("orgs"), "manitoba")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def manitoba_naburelease(context, msg: str):
- returned_value = gleanerio(("release"), "manitoba")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@graph
-def harvest_manitoba():
- harvest = manitoba_gleaner()
- load1 = manitoba_nabu(harvest)
- load2 = manitoba_nabuprov(load1)
- load3 = manitoba_nabuorg(load2)
- load4 = manitoba_naburelease(load3)
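
The archive step is how the gleaner/nabu config reaches the container: the tarball named by the `GLEANERIO_*_ARCHIVE_OBJECT` variable is streamed out of MinIO and `PUT` to Docker's `containers/{id}/archive` endpoint, which extracts it at `ARCHIVE_PATH`. A sketch of just that step (`put_archive` is illustrative):

```python
# Sketch of the config-upload step; put_archive is illustrative. Docker's
# archive endpoint expects a tar archive and extracts it at `path`.
import urllib.parse
from urllib import request

def put_archive(url_base, apikey, cid, path, tar_bytes):
    # tar_bytes: the config tarball as bytes (e.g. s3reader(...).read())
    qs = urllib.parse.urlencode({"path": path})
    req = request.Request(url_base + "containers/" + cid + "/archive?" + qs,
                          data=tar_bytes, method="PUT")
    req.add_header("X-API-Key", apikey)
    req.add_header("content-type", "application/x-compressed")
    return request.urlopen(req).status
```
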
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_matcommons.py b/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_matcommons.py
deleted file mode 100644
index 1f8516de..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_matcommons.py
+++ /dev/null
@@ -1,304 +0,0 @@
-import distutils.util
-
-from dagster import job, op, graph, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-
-# Vars and Envs
-
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-MINIO_URL = os.environ.get('GLEANER_MINIO_URL')
-MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT')
-MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL')
-MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET')
-MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY')
-MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET')
-
-
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- get_dagster_logger().info(f"S3 URL : {str(os.environ.get('GLEANER_MINIO_URL'))}")
- get_dagster_logger().info(f"S3 PORT : {str(os.environ.get('GLEANER_MINIO_PORT'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {str(os.environ.get('GLEANER_MINIO_BUCKET'))}")
- get_dagster_logger().info(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
- try:
- data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- client = Minio(
- server,
- # secure=True,
- secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
-
- # Make bucket 'X' if it does not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname
- client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'),
- objPrefix,
- io.BytesIO(data),
- len(data))
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-
-
-def gleanerio(mode, source):
- ## ------------ Create
-
- get_dagster_logger().info(f"Create: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH')
- CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"]
- NAME = "gleaner01_" + source
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "nabu"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
- return 1
-
- data = {}
- data["Image"] = IMAGE
- data["Cmd"] = CMD
-
- # Add env variables here, e.g. "Env": ["FOO=bar", "BAZ=quux"]
- enva = []
- enva.append(str("MINIO_URL={}".format(MINIO_URL)))
- enva.append(str("MINIO_PORT={}".format(MINIO_PORT)))
- enva.append(str("MINIO_SSL={}".format(MINIO_SSL)))
- enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET)))
- enva.append(str("MINIO_KEY={}".format(MINIO_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET)))
-
- data["Env"] = enva
-
- url = URL + 'containers/create'
- params = {
- "name": NAME
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- get_dagster_logger().info(f"URL: {str(url)}")
-
- req = request.Request(url, str.encode(json.dumps(data)))
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- c = r.read()
- d = json.loads(c)
- cid = d['Id']
-
- print(r.status)
- get_dagster_logger().info(f"Create: {str(r.status)}")
-
- # print(cid)
-
- ## ------------ Archive upload: send the config (read from MinIO via s3reader) into the container
-
- url = URL + 'containers/' + cid + '/archive'
- params = {
- 'path': ARCHIVE_PATH
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
-
- # DATA = read_file_bytestream(ARCHIVE_FILE)
- DATA = s3reader(ARCHIVE_FILE)
-
- req = request.Request(url, data=DATA, method="PUT")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- print(r.status)
- get_dagster_logger().info(f"Archive: {str(r.status)}")
-
- # c = r.read()
- # print(c)
- # d = json.loads(c)
- # print(d)
-
- ## ------------ Start
-
- url = URL + 'containers/' + cid + '/start'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Start: {str(r.status)}")
-
- ## ------------ Wait expect 200
-
- url = URL + 'containers/' + cid + '/wait'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Wait: {str(r.status)}")
-
- ## ------------ Copy logs expect 200
-
- url = URL + 'containers/' + cid + '/logs'
- params = {
- 'stdout': 'true',
- 'stderr': 'false'
- }
- query_string = urllib.parse.urlencode(params)
-
- url = url + "?" + query_string
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- c = r.read()
-
- # write to file
- # f = open(LOGFILE, 'w')
- # f.write(str(c))
- # f.close()
-
- # write to s3
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes-like object
-
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"Logs: {str(r.status)}")
-
- ## ------------ Remove expect 204
-
- url = URL + 'containers/' + cid
- req = request.Request(url, method="DELETE")
- req.add_header('X-API-Key', APIKEY)
- # req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Remove: {str(r.status)}")
-
- return 0
-
-@op
-def matcommons_gleaner(context):
- returned_value = gleanerio(("gleaner"), "matcommons")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner notes are {r} ")
- return r
-
-@op
-def matcommons_nabu(context, msg: str):
- returned_value = gleanerio(("nabu"), "matcommons")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def matcommons_nabuprov(context, msg: str):
- returned_value = gleanerio(("prov"), "matcommons")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def matcommons_nabuorg(context, msg: str):
- returned_value = gleanerio(("orgs"), "matcommons")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def matcommons_naburelease(context, msg: str):
- returned_value = gleanerio(("release"), "matcommons")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@graph
-def harvest_matcommons():
- harvest = matcommons_gleaner()
- load1 = matcommons_nabu(harvest)
- load2 = matcommons_nabuprov(load1)
- load3 = matcommons_nabuorg(load2)
- load4 = matcommons_naburelease(load3)
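
Note that a `@graph` such as `harvest_matcommons` is not runnable by itself; presumably the generated jobs and schedule modules turn each graph into a job. A minimal sketch of that wiring, under that assumption (the job name and cron string are illustrative):

```python
# Sketch, assuming the generated jobs/sched modules do something equivalent.
from dagster import ScheduleDefinition

harvest_matcommons_job = harvest_matcommons.to_job(name="harvest_matcommons_job")

harvest_matcommons_schedule = ScheduleDefinition(
    job=harvest_matcommons_job,
    cron_schedule="0 6 * * *",  # illustrative cron, not the real schedule
)
```
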
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_mdf.py b/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_mdf.py
deleted file mode 100644
index 30660773..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_mdf.py
+++ /dev/null
@@ -1,304 +0,0 @@
-import distutils.util
-
-from dagster import job, op, graph, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-
-# Vars and Envs
-
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-MINIO_URL = os.environ.get('GLEANER_MINIO_URL')
-MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT')
-MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL')
-MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET')
-MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY')
-MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET')
-
-
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- get_dagster_logger().info(f"S3 URL : {str(os.environ.get('GLEANER_MINIO_URL'))}")
- get_dagster_logger().info(f"S3 PORT : {str(os.environ.get('GLEANER_MINIO_PORT'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {str(os.environ.get('GLEANER_MINIO_BUCKET'))}")
- get_dagster_logger().info(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
- try:
- data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- client = Minio(
- server,
- # secure=True,
- secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
-
- # Make bucket 'X' if it does not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname
- client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'),
- objPrefix,
- io.BytesIO(data),
- len(data))
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-
-
-def gleanerio(mode, source):
- ## ------------ Create
-
- get_dagster_logger().info(f"Create: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH')
- CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"]
- NAME = "gleaner01_" + source
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "nabu"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
- return 1
-
- data = {}
- data["Image"] = IMAGE
- data["Cmd"] = CMD
-
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
- enva = []
- enva.append(str("MINIO_URL={}".format(MINIO_URL)))
- enva.append(str("MINIO_PORT={}".format(MINIO_PORT)))
- enva.append(str("MINIO_SSL={}".format(MINIO_SSL)))
- enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET)))
- enva.append(str("MINIO_KEY={}".format(MINIO_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET)))
-
- data["Env"] = enva
-
- url = URL + 'containers/create'
- params = {
- "name": NAME
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- get_dagster_logger().info(f"URL: {str(url)}")
-
- req = request.Request(url, str.encode(json.dumps(data)))
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- c = r.read()
- d = json.loads(c)
- cid = d['Id']
-
- print(r.status)
- get_dagster_logger().info(f"Create: {str(r.status)}")
-
- # print(cid)
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
- url = URL + 'containers/' + cid + '/archive'
- params = {
- 'path': ARCHIVE_PATH
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
-
- # DATA = read_file_bytestream(ARCHIVE_FILE)
- DATA = s3reader(ARCHIVE_FILE)
-
- req = request.Request(url, data=DATA, method="PUT")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- print(r.status)
- get_dagster_logger().info(f"Archive: {str(r.status)}")
-
- # c = r.read()
- # print(c)
- # d = json.loads(c)
- # print(d)
-
- ## ------------ Start
-
- url = URL + 'containers/' + cid + '/start'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Start: {str(r.status)}")
-
- ## ------------ Wait expect 200
-
- url = URL + 'containers/' + cid + '/wait'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Wait: {str(r.status)}")
-
- ## ------------ Copy logs expect 200
-
- url = URL + 'containers/' + cid + '/logs'
- params = {
- 'stdout': 'true',
- 'stderr': 'false'
- }
- query_string = urllib.parse.urlencode(params)
-
- url = url + "?" + query_string
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- c = r.read()
-
- # write to file
- # f = open(LOGFILE, 'w')
- # f.write(str(c))
- # f.close()
-
- # write to s3
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
-
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"Logs: {str(r.status)}")
-
- ## ------------ Remove expect 204
-
- url = URL + 'containers/' + cid
- req = request.Request(url, method="DELETE")
- req.add_header('X-API-Key', APIKEY)
- # req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Remove: {str(r.status)}")
-
- return 0
-
-@op
-def mdf_gleaner(context):
- returned_value = gleanerio(("gleaner"), "mdf")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner notes are {r} ")
- return r
-
-@op
-def mdf_nabu(context, msg: str):
- returned_value = gleanerio(("nabu"), "mdf")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def mdf_nabuprov(context, msg: str):
- returned_value = gleanerio(("prov"), "mdf")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def mdf_nabuorg(context, msg: str):
- returned_value = gleanerio(("orgs"), "mdf")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def mdf_naburelease(context, msg: str):
- returned_value = gleanerio(("release"), "mdf")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@graph
-def harvest_mdf():
- harvest = mdf_gleaner()
- load1 = mdf_nabu(harvest)
- load2 = mdf_nabuprov(load1)
- load3 = mdf_nabuorg(load2)
- load4 = mdf_naburelease(load3)
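The body of each module is dominated by one function, `gleanerio`, which walks a container through the Docker Engine API behind Portainer: create, upload a config archive, start, wait, fetch logs, remove. A condensed sketch of that lifecycle with the error handling the generated code omits; `urlopen` raises `HTTPError` on any non-2xx response, so each unchecked call can abort an op mid-lifecycle and leak the container. `_call` and `run_container` are illustrative names; the endpoint paths, headers, and env vars are taken from the deleted code:

```python
# Condensed sketch of the Portainer-proxied container lifecycle in the
# deleted modules, with explicit error handling added.
import json
import os
from urllib import parse, request
from urllib.error import HTTPError, URLError

URL = os.environ["PORTAINER_URL"]      # Docker Engine API base behind Portainer
APIKEY = os.environ["PORTAINER_KEY"]

def _call(path, method="GET", body=None, ctype="application/json", params=None):
    url = URL + path + ("?" + parse.urlencode(params) if params else "")
    req = request.Request(url, data=body, method=method)
    req.add_header("X-API-Key", APIKEY)
    req.add_header("content-type", ctype)
    req.add_header("accept", "application/json")
    try:
        with request.urlopen(req) as r:
            return r.status, r.read()
    except (HTTPError, URLError) as err:
        raise RuntimeError(f"{method} {url} failed: {err}") from err

def run_container(image, cmd, name, archive_bytes, archive_path, env=()):
    spec = {"Image": image, "Cmd": cmd, "Env": list(env)}
    _, body = _call("containers/create", "POST",
                    json.dumps(spec).encode(), params={"name": name})
    cid = json.loads(body)["Id"]
    # PUT a tar archive holding the config file into the container filesystem.
    _call(f"containers/{cid}/archive", "PUT", archive_bytes,
          ctype="application/x-compressed", params={"path": archive_path})
    _call(f"containers/{cid}/start", "POST")
    _call(f"containers/{cid}/wait", "POST")          # blocks until exit
    _, logs = _call(f"containers/{cid}/logs", "GET",
                    params={"stdout": "true", "stderr": "false"})
    _call(f"containers/{cid}", "DELETE")             # expect 204
    return logs
```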
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_milano.py b/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_milano.py
deleted file mode 100644
index 4460a79a..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_milano.py
+++ /dev/null
@@ -1,304 +0,0 @@
-import distutils
-
-from dagster import op, graph, get_dagster_logger
-import subprocess
-import os, json, io
-import urllib
-from urllib import request
-from dagster import job, op, get_dagster_logger
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-
-# Vars and Envs
-
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-MINIO_URL = os.environ.get('GLEANER_MINIO_URL')
-MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT')
-MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL')
-MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET')
-MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY')
-MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET')
-
-
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- get_dagster_logger().info(f"S3 URL : {str(os.environ.get('GLEANER_MINIO_URL'))}")
- get_dagster_logger().info(f"S3 PORT : {str(os.environ.get('GLEANER_MINIO_PORT'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {str(os.environ.get('GLEANER_MINIO_BUCKET'))}")
- get_dagster_logger().info(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
- try:
- data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- client = Minio(
- server,
- # secure=True,
- secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname
- client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'),
- objPrefix,
- io.BytesIO(data),
- len(data))
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-
-
-def gleanerio(mode, source):
- ## ------------ Create
-
- get_dagster_logger().info(f"Create: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH')
- CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"]
- NAME = "gleaner01_" + source
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "nabu"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
- return 1
-
- data = {}
- data["Image"] = IMAGE
- data["Cmd"] = CMD
-
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
- enva = []
- enva.append(str("MINIO_URL={}".format(MINIO_URL)))
- enva.append(str("MINIO_PORT={}".format(MINIO_PORT)))
- enva.append(str("MINIO_SSL={}".format(MINIO_SSL)))
- enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET)))
- enva.append(str("MINIO_KEY={}".format(MINIO_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET)))
-
- data["Env"] = enva
-
- url = URL + 'containers/create'
- params = {
- "name": NAME
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- get_dagster_logger().info(f"URL: {str(url)}")
-
- req = request.Request(url, str.encode(json.dumps(data)))
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- c = r.read()
- d = json.loads(c)
- cid = d['Id']
-
- print(r.status)
- get_dagster_logger().info(f"Create: {str(r.status)}")
-
- # print(cid)
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
- url = URL + 'containers/' + cid + '/archive'
- params = {
- 'path': ARCHIVE_PATH
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
-
- # DATA = read_file_bytestream(ARCHIVE_FILE)
- DATA = s3reader(ARCHIVE_FILE)
-
- req = request.Request(url, data=DATA, method="PUT")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- print(r.status)
- get_dagster_logger().info(f"Archive: {str(r.status)}")
-
- # c = r.read()
- # print(c)
- # d = json.loads(c)
- # print(d)
-
- ## ------------ Start
-
- url = URL + 'containers/' + cid + '/start'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Start: {str(r.status)}")
-
- ## ------------ Wait expect 200
-
- url = URL + 'containers/' + cid + '/wait'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Wait: {str(r.status)}")
-
- ## ------------ Copy logs expect 200
-
- url = URL + 'containers/' + cid + '/logs'
- params = {
- 'stdout': 'true',
- 'stderr': 'false'
- }
- query_string = urllib.parse.urlencode(params)
-
- url = url + "?" + query_string
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- c = r.read()
-
- # write to file
- # f = open(LOGFILE, 'w')
- # f.write(str(c))
- # f.close()
-
- # write to s3
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
-
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"Logs: {str(r.status)}")
-
- ## ------------ Remove expect 204
-
- url = URL + 'containers/' + cid
- req = request.Request(url, method="DELETE")
- req.add_header('X-API-Key', APIKEY)
- # req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Remove: {str(r.status)}")
-
- return 0
-
-@op
-def milano_gleaner(context):
- returned_value = gleanerio(("gleaner"), "milano")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner notes are {r} ")
- return r
-
-@op
-def milano_nabu(context, msg: str):
- returned_value = gleanerio(("nabu"), "milano")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def milano_nabuprov(context, msg: str):
- returned_value = gleanerio(("prov"), "milano")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def milano_nabuorg(context, msg: str):
- returned_value = gleanerio(("orgs"), "milano")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def milano_naburelease(context, msg: str):
- returned_value = gleanerio(("release"), "milano")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@graph
-def harvest_milano():
- harvest = milano_gleaner()
- load1 = milano_nabu(harvest)
- load2 = milano_nabuprov(load1)
- load3 = milano_nabuorg(load2)
- load4 = milano_naburelease(load3)
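Every copy also carries the same MinIO helpers, built on `distutils.util.strtobool`. `distutils` is deprecated and removed in Python 3.12 (PEP 632), so that flag parsing will break on newer base images. A sketch of the same helpers with a local predicate (`_truthy` and `_client` are illustrative names); note too that the deleted `s3reader` swallows `S3Error` and returns `None`, which the archive-upload step would then PUT as an empty request body:

```python
# Sketch of the MinIO helpers shared by every deleted module, with
# distutils.util.strtobool (gone with distutils in Python 3.12, PEP 632)
# replaced by a local predicate. _truthy and _client are illustrative names.
import io
import os
from datetime import datetime
from minio import Minio

def _truthy(val: str) -> bool:
    return str(val).strip().lower() in ("y", "yes", "t", "true", "on", "1")

def _client() -> Minio:
    return Minio(
        os.environ["GLEANER_MINIO_URL"] + ":" + os.environ["GLEANER_MINIO_PORT"],
        secure=_truthy(os.environ.get("GLEANER_MINIO_SSL", "false")),
        access_key=os.environ["GLEANER_MINIO_KEY"],
        secret_key=os.environ["GLEANER_MINIO_SECRET"],
    )

def s3reader(obj: str):
    # Returns a urllib3 response; let S3Error propagate instead of
    # returning None as the deleted code does.
    return _client().get_object(os.environ["GLEANER_MINIO_BUCKET"], obj)

def s3loader(data: bytes, name: str) -> str:
    stamp = datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
    key = os.environ["GLEANERIO_LOG_PREFIX"] + f"{name}_{stamp}.log"
    _client().put_object(os.environ["GLEANER_MINIO_BUCKET"], key,
                         io.BytesIO(data), len(data))
    return key
```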
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_neon.py b/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_neon.py
deleted file mode 100644
index 8d300b6a..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_neon.py
+++ /dev/null
@@ -1,304 +0,0 @@
-import distutils
-
-from dagster import op, graph, get_dagster_logger
-import subprocess
-import os, json, io
-import urllib
-from urllib import request
-from dagster import job, op, get_dagster_logger
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-
-# Vars and Envs
-
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-MINIO_URL = os.environ.get('GLEANER_MINIO_URL')
-MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT')
-MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL')
-MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET')
-MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY')
-MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET')
-
-
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- get_dagster_logger().info(f"S3 URL : {str(os.environ.get('GLEANER_MINIO_URL'))}")
- get_dagster_logger().info(f"S3 PORT : {str(os.environ.get('GLEANER_MINIO_PORT'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {str(os.environ.get('GLEANER_MINIO_BUCKET'))}")
- get_dagster_logger().info(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
- try:
- data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- client = Minio(
- server,
- # secure=True,
- secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname
- client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'),
- objPrefix,
- io.BytesIO(data),
- len(data))
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-
-
-def gleanerio(mode, source):
- ## ------------ Create
-
- get_dagster_logger().info(f"Create: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH')
- CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"]
- NAME = "gleaner01_" + source
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "nabu"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
- return 1
-
- data = {}
- data["Image"] = IMAGE
- data["Cmd"] = CMD
-
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
- enva = []
- enva.append(str("MINIO_URL={}".format(MINIO_URL)))
- enva.append(str("MINIO_PORT={}".format(MINIO_PORT)))
- enva.append(str("MINIO_SSL={}".format(MINIO_SSL)))
- enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET)))
- enva.append(str("MINIO_KEY={}".format(MINIO_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET)))
-
- data["Env"] = enva
-
- url = URL + 'containers/create'
- params = {
- "name": NAME
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- get_dagster_logger().info(f"URL: {str(url)}")
-
- req = request.Request(url, str.encode(json.dumps(data)))
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- c = r.read()
- d = json.loads(c)
- cid = d['Id']
-
- print(r.status)
- get_dagster_logger().info(f"Create: {str(r.status)}")
-
- # print(cid)
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
- url = URL + 'containers/' + cid + '/archive'
- params = {
- 'path': ARCHIVE_PATH
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
-
- # DATA = read_file_bytestream(ARCHIVE_FILE)
- DATA = s3reader(ARCHIVE_FILE)
-
- req = request.Request(url, data=DATA, method="PUT")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- print(r.status)
- get_dagster_logger().info(f"Archive: {str(r.status)}")
-
- # c = r.read()
- # print(c)
- # d = json.loads(c)
- # print(d)
-
- ## ------------ Start
-
- url = URL + 'containers/' + cid + '/start'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Start: {str(r.status)}")
-
- ## ------------ Wait expect 200
-
- url = URL + 'containers/' + cid + '/wait'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Wait: {str(r.status)}")
-
- ## ------------ Copy logs expect 200
-
- url = URL + 'containers/' + cid + '/logs'
- params = {
- 'stdout': 'true',
- 'stderr': 'false'
- }
- query_string = urllib.parse.urlencode(params)
-
- url = url + "?" + query_string
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- c = r.read()
-
- # write to file
- # f = open(LOGFILE, 'w')
- # f.write(str(c))
- # f.close()
-
- # write to s3
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
-
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"Logs: {str(r.status)}")
-
- ## ------------ Remove expect 204
-
- url = URL + 'containers/' + cid
- req = request.Request(url, method="DELETE")
- req.add_header('X-API-Key', APIKEY)
- # req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Remove: {str(r.status)}")
-
- return 0
-
-@op
-def neon_gleaner(context):
- returned_value = gleanerio(("gleaner"), "neon")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner notes are {r} ")
- return r
-
-@op
-def neon_nabu(context, msg: str):
- returned_value = gleanerio(("nabu"), "neon")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def neon_nabuprov(context, msg: str):
- returned_value = gleanerio(("prov"), "neon")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def neon_nabuorg(context, msg: str):
- returned_value = gleanerio(("orgs"), "neon")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def neon_naburelease(context, msg: str):
- returned_value = gleanerio(("release"), "neon")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@graph
-def harvest_neon():
- harvest = neon_gleaner()
- load1 = neon_nabu(harvest)
- load2 = neon_nabuprov(load1)
- load3 = neon_nabuorg(load2)
- load4 = neon_naburelease(load3)
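One more recurring wrinkle: the log-copy step calls `s3loader(str(c).encode(), NAME)` on `c`, which is already `bytes`, so the object written to the bucket is the Python repr (`b'...'` with escaped newlines) rather than the log text. In addition, when a container runs without a TTY the Docker `/logs` endpoint returns a multiplexed stream with 8-byte frame headers. A hedged sketch of a decode that handles both; whether the framing applies here depends on container settings this diff does not show:

```python
# Sketch: decode Docker's multiplexed log stream instead of uploading the
# Python repr of a bytes object (str(c).encode() writes "b'...'" to S3).
import struct

def demux_docker_logs(raw: bytes) -> str:
    """Strip the 8-byte frame headers Docker prepends when the container
    runs without a TTY; fall back to a plain decode otherwise."""
    out, i = [], 0
    # Header layout: stream_type (0/1/2), three zero bytes, big-endian size.
    while i + 8 <= len(raw) and raw[i] in (0, 1, 2) \
            and raw[i + 1:i + 4] == b"\x00\x00\x00":
        (size,) = struct.unpack(">I", raw[i + 4:i + 8])
        out.append(raw[i + 8:i + 8 + size])
        i += 8 + size
    if out and i >= len(raw):
        return b"".join(out).decode("utf-8", errors="replace")
    return raw.decode("utf-8", errors="replace")

# s3loader(demux_docker_logs(c).encode(), NAME)  # instead of str(c).encode()
```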
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_netherland.py b/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_netherland.py
deleted file mode 100644
index 7e73cb88..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_netherland.py
+++ /dev/null
@@ -1,304 +0,0 @@
-import distutils
-
-from dagster import op, graph, get_dagster_logger
-import subprocess
-import os, json, io
-import urllib
-from urllib import request
-from dagster import job, op, get_dagster_logger
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-
-# Vars and Envs
-
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-MINIO_URL = os.environ.get('GLEANER_MINIO_URL')
-MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT')
-MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL')
-MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET')
-MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY')
-MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET')
-
-
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- get_dagster_logger().info(f"S3 URL : {str(os.environ.get('GLEANER_MINIO_URL'))}")
- get_dagster_logger().info(f"S3 PORT : {str(os.environ.get('GLEANER_MINIO_PORT'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {str(os.environ.get('GLEANER_MINIO_BUCKET'))}")
- get_dagster_logger().info(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
- try:
- data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- client = Minio(
- server,
- # secure=True,
- secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname
- client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'),
- objPrefix,
- io.BytesIO(data),
- len(data))
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-
-
-def gleanerio(mode, source):
- ## ------------ Create
-
- get_dagster_logger().info(f"Create: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH')
- CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"]
- NAME = "gleaner01_" + source
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "nabu"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
- return 1
-
- data = {}
- data["Image"] = IMAGE
- data["Cmd"] = CMD
-
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
- enva = []
- enva.append(str("MINIO_URL={}".format(MINIO_URL)))
- enva.append(str("MINIO_PORT={}".format(MINIO_PORT)))
- enva.append(str("MINIO_SSL={}".format(MINIO_SSL)))
- enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET)))
- enva.append(str("MINIO_KEY={}".format(MINIO_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET)))
-
- data["Env"] = enva
-
- url = URL + 'containers/create'
- params = {
- "name": NAME
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- get_dagster_logger().info(f"URL: {str(url)}")
-
- req = request.Request(url, str.encode(json.dumps(data)))
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- c = r.read()
- d = json.loads(c)
- cid = d['Id']
-
- print(r.status)
- get_dagster_logger().info(f"Create: {str(r.status)}")
-
- # print(cid)
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
- url = URL + 'containers/' + cid + '/archive'
- params = {
- 'path': ARCHIVE_PATH
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
-
- # DATA = read_file_bytestream(ARCHIVE_FILE)
- DATA = s3reader(ARCHIVE_FILE)
-
- req = request.Request(url, data=DATA, method="PUT")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- print(r.status)
- get_dagster_logger().info(f"Archive: {str(r.status)}")
-
- # c = r.read()
- # print(c)
- # d = json.loads(c)
- # print(d)
-
- ## ------------ Start
-
- url = URL + 'containers/' + cid + '/start'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Start: {str(r.status)}")
-
- ## ------------ Wait expect 200
-
- url = URL + 'containers/' + cid + '/wait'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Wait: {str(r.status)}")
-
- ## ------------ Copy logs expect 200
-
- url = URL + 'containers/' + cid + '/logs'
- params = {
- 'stdout': 'true',
- 'stderr': 'false'
- }
- query_string = urllib.parse.urlencode(params)
-
- url = url + "?" + query_string
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- c = r.read()
-
- # write to file
- # f = open(LOGFILE, 'w')
- # f.write(str(c))
- # f.close()
-
- # write to s3
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
-
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"Logs: {str(r.status)}")
-
- ## ------------ Remove expect 204
-
- url = URL + 'containers/' + cid
- req = request.Request(url, method="DELETE")
- req.add_header('X-API-Key', APIKEY)
- # req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Remove: {str(r.status)}")
-
- return 0
-
-@op
-def netherland_gleaner(context):
- returned_value = gleanerio(("gleaner"), "netherland")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner notes are {r} ")
- return r
-
-@op
-def netherland_nabu(context, msg: str):
- returned_value = gleanerio(("nabu"), "netherland")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def netherland_nabuprov(context, msg: str):
- returned_value = gleanerio(("prov"), "netherland")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def netherland_nabuorg(context, msg: str):
- returned_value = gleanerio(("orgs"), "netherland")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def netherland_naburelease(context, msg: str):
- returned_value = gleanerio(("release"), "netherland")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@graph
-def harvest_netherland():
- harvest = netherland_gleaner()
- load1 = netherland_nabu(harvest)
- load2 = netherland_nabuprov(load1)
- load3 = netherland_nabuorg(load2)
- load4 = netherland_naburelease(load3)
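Each `@op` folds `gleanerio`'s integer return code into a log string, so a failing mode (the function returns 1 on an unrecognized mode) still reads as a successful step in Dagit. A sketch of surfacing it as a real step failure via `dagster.Failure`; this changes behavior and is a suggestion, not what the generated code does:

```python
# Sketch: raise dagster.Failure on a non-zero gleanerio return code so the
# run actually fails, instead of folding the code into a log string.
# Assumes gleanerio as defined in the module above.
from dagster import op, Failure, get_dagster_logger

@op
def netherland_gleaner(context):
    rc = gleanerio("gleaner", "netherland")
    if rc != 0:
        raise Failure(description=f"gleanerio exited with {rc}")
    get_dagster_logger().info("gleaner finished cleanly")
    return "returned value:0"
```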
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_nioz.py b/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_nioz.py
deleted file mode 100644
index 8a63dac1..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_nioz.py
+++ /dev/null
@@ -1,304 +0,0 @@
-import distutils
-
-from dagster import op, graph, get_dagster_logger
-import subprocess
-import os, json, io
-import urllib
-from urllib import request
-from dagster import job, op, get_dagster_logger
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-
-# Vars and Envs
-
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-MINIO_URL = os.environ.get('GLEANER_MINIO_URL')
-MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT')
-MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL')
-MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET')
-MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY')
-MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET')
-
-
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- get_dagster_logger().info(f"S3 URL : {str(os.environ.get('GLEANER_MINIO_URL'))}")
- get_dagster_logger().info(f"S3 PORT : {str(os.environ.get('GLEANER_MINIO_PORT'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {str(os.environ.get('GLEANER_MINIO_BUCKET'))}")
- get_dagster_logger().info(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
- try:
- data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- client = Minio(
- server,
- # secure=True,
- secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname
- client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'),
- objPrefix,
- io.BytesIO(data),
- len(data))
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-
-
-def gleanerio(mode, source):
- ## ------------ Create
-
- get_dagster_logger().info(f"Create: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH')
- CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"]
- NAME = "gleaner01_" + source
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "nabu"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
- return 1
-
- data = {}
- data["Image"] = IMAGE
- data["Cmd"] = CMD
-
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
- enva = []
- enva.append(str("MINIO_URL={}".format(MINIO_URL)))
- enva.append(str("MINIO_PORT={}".format(MINIO_PORT)))
- enva.append(str("MINIO_SSL={}".format(MINIO_SSL)))
- enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET)))
- enva.append(str("MINIO_KEY={}".format(MINIO_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET)))
-
- data["Env"] = enva
-
- url = URL + 'containers/create'
- params = {
- "name": NAME
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- get_dagster_logger().info(f"URL: {str(url)}")
-
- req = request.Request(url, str.encode(json.dumps(data)))
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- c = r.read()
- d = json.loads(c)
- cid = d['Id']
-
- print(r.status)
- get_dagster_logger().info(f"Create: {str(r.status)}")
-
- # print(cid)
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
- url = URL + 'containers/' + cid + '/archive'
- params = {
- 'path': ARCHIVE_PATH
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
-
- # DATA = read_file_bytestream(ARCHIVE_FILE)
- DATA = s3reader(ARCHIVE_FILE)
-
- req = request.Request(url, data=DATA, method="PUT")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- print(r.status)
- get_dagster_logger().info(f"Archive: {str(r.status)}")
-
- # c = r.read()
- # print(c)
- # d = json.loads(c)
- # print(d)
-
- ## ------------ Start
-
- url = URL + 'containers/' + cid + '/start'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Start: {str(r.status)}")
-
- ## ------------ Wait expect 200
-
- url = URL + 'containers/' + cid + '/wait'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Wait: {str(r.status)}")
-
- ## ------------ Copy logs expect 200
-
- url = URL + 'containers/' + cid + '/logs'
- params = {
- 'stdout': 'true',
- 'stderr': 'false'
- }
- query_string = urllib.parse.urlencode(params)
-
- url = url + "?" + query_string
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- c = r.read()
-
- # write to file
- # f = open(LOGFILE, 'w')
- # f.write(str(c))
- # f.close()
-
- # write to s3
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
-
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"Logs: {str(r.status)}")
-
- ## ------------ Remove expect 204
-
- url = URL + 'containers/' + cid
- req = request.Request(url, method="DELETE")
- req.add_header('X-API-Key', APIKEY)
- # req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Remove: {str(r.status)}")
-
- return 0
-
-@op
-def nioz_gleaner(context):
- returned_value = gleanerio(("gleaner"), "nioz")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner notes are {r} ")
- return r
-
-@op
-def nioz_nabu(context, msg: str):
- returned_value = gleanerio(("nabu"), "nioz")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def nioz_nabuprov(context, msg: str):
- returned_value = gleanerio(("prov"), "nioz")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def nioz_nabuorg(context, msg: str):
- returned_value = gleanerio(("orgs"), "nioz")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def nioz_naburelease(context, msg: str):
- returned_value = gleanerio(("release"), "nioz")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@graph
-def harvest_nioz():
- harvest = nioz_gleaner()
- load1 = nioz_nabu(harvest)
- load2 = nioz_nabuprov(load1)
- load3 = nioz_nabuorg(load2)
- load4 = nioz_naburelease(load3)
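The `/wait` call is the one place the Docker API reports the harvested container's exit status: the 200 response body is `{"StatusCode": N}`, which the deleted code discards after printing the HTTP status. A small sketch of reading it (`wait_for_exit` is an illustrative helper name):

```python
# Sketch: the /wait response body carries the container's exit status as
# {"StatusCode": N}; the deleted code logs only the HTTP 200 and drops it.
import json
from urllib import request

def wait_for_exit(url: str, cid: str, apikey: str) -> int:
    req = request.Request(f"{url}containers/{cid}/wait", method="POST")
    req.add_header("X-API-Key", apikey)
    req.add_header("accept", "application/json")
    with request.urlopen(req) as r:
        return int(json.loads(r.read())["StatusCode"])
```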
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_norway.py b/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_norway.py
deleted file mode 100644
index 777318c6..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_norway.py
+++ /dev/null
@@ -1,304 +0,0 @@
-import distutils
-
-from dagster import op, graph, get_dagster_logger
-import subprocess
-import os, json, io
-import urllib
-from urllib import request
-from dagster import job, op, get_dagster_logger
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-
-# Vars and Envs
-
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-MINIO_URL = os.environ.get('GLEANER_MINIO_URL')
-MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT')
-MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL')
-MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET')
-MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY')
-MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET')
-
-
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- get_dagster_logger().info(f"S3 URL : {str(os.environ.get('GLEANER_MINIO_URL'))}")
- get_dagster_logger().info(f"S3 PORT : {str(os.environ.get('GLEANER_MINIO_PORT'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {str(os.environ.get('GLEANER_MINIO_BUCKET'))}")
- get_dagster_logger().info(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
- try:
- data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- client = Minio(
- server,
- # secure=True,
- secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname
- client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'),
- objPrefix,
- io.BytesIO(data),
- len(data))
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-
-
-def gleanerio(mode, source):
- ## ------------ Create
-
- get_dagster_logger().info(f"Create: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH')
- CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"]
- NAME = "gleaner01_" + source
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "nabu"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
- return 1
-
- data = {}
- data["Image"] = IMAGE
- data["Cmd"] = CMD
-
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
- enva = []
- enva.append(str("MINIO_URL={}".format(MINIO_URL)))
- enva.append(str("MINIO_PORT={}".format(MINIO_PORT)))
- enva.append(str("MINIO_SSL={}".format(MINIO_SSL)))
- enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET)))
- enva.append(str("MINIO_KEY={}".format(MINIO_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET)))
-
- data["Env"] = enva
-
- url = URL + 'containers/create'
- params = {
- "name": NAME
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- get_dagster_logger().info(f"URL: {str(url)}")
-
- req = request.Request(url, str.encode(json.dumps(data)))
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- c = r.read()
- d = json.loads(c)
- cid = d['Id']
-
- print(r.status)
- get_dagster_logger().info(f"Create: {str(r.status)}")
-
- # print(cid)
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
- url = URL + 'containers/' + cid + '/archive'
- params = {
- 'path': ARCHIVE_PATH
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
-
- # DATA = read_file_bytestream(ARCHIVE_FILE)
- DATA = s3reader(ARCHIVE_FILE)
-
- req = request.Request(url, data=DATA, method="PUT")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- print(r.status)
- get_dagster_logger().info(f"Archive: {str(r.status)}")
-
- # c = r.read()
- # print(c)
- # d = json.loads(c)
- # print(d)
-
- ## ------------ Start
-
- url = URL + 'containers/' + cid + '/start'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Start: {str(r.status)}")
-
- ## ------------ Wait expect 200
-
- url = URL + 'containers/' + cid + '/wait'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Wait: {str(r.status)}")
-
- ## ------------ Copy logs expect 200
-
- url = URL + 'containers/' + cid + '/logs'
- params = {
- 'stdout': 'true',
- 'stderr': 'false'
- }
- query_string = urllib.parse.urlencode(params)
-
- url = url + "?" + query_string
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- c = r.read()
-
- # write to file
- # f = open(LOGFILE, 'w')
- # f.write(str(c))
- # f.close()
-
- # write to s3
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
-
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"Logs: {str(r.status)}")
-
- ## ------------ Remove expect 204
-
- url = URL + 'containers/' + cid
- req = request.Request(url, method="DELETE")
- req.add_header('X-API-Key', APIKEY)
- # req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Remove: {str(r.status)}")
-
- return 0
-
-@op
-def norway_gleaner(context):
- returned_value = gleanerio(("gleaner"), "norway")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner notes are {r} ")
- return r
-
-@op
-def norway_nabu(context, msg: str):
- returned_value = gleanerio(("nabu"), "norway")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def norway_nabuprov(context, msg: str):
- returned_value = gleanerio(("prov"), "norway")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def norway_nabuorg(context, msg: str):
- returned_value = gleanerio(("orgs"), "norway")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def norway_naburelease(context, msg: str):
- returned_value = gleanerio(("release"), "norway")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@graph
-def harvest_norway():
- harvest = norway_gleaner()
- load1 = norway_nabu(harvest)
- load2 = norway_nabuprov(load1)
- load3 = norway_nabuorg(load2)
- load4 = norway_naburelease(load3)
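Finally, the `harvest_<source>` graphs removed here are only definitions. For completeness, a sketch of how one could be materialized into a runnable job for a local smoke test, assuming the `harvest_norway` graph above; the job name and executor choice are illustrative, not how this repo wires its jobs and schedules:

```python
# Sketch: turn the harvest_norway graph into a job and run it in-process.
from dagster import in_process_executor

harvest_norway_job = harvest_norway.to_job(
    name="harvest_norway_job", executor_def=in_process_executor
)

if __name__ == "__main__":
    result = harvest_norway_job.execute_in_process()
    print(result.success)
```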
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_ntu.py b/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_ntu.py
deleted file mode 100644
index e5fbe902..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_ntu.py
+++ /dev/null
@@ -1,304 +0,0 @@
-import distutils
-
-from dagster import op, graph, get_dagster_logger
-import subprocess
-import os, json, io
-import urllib
-from urllib import request
-from dagster import job, op, get_dagster_logger
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-
-# Vars and Envs
-
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-MINIO_URL = os.environ.get('GLEANER_MINIO_URL')
-MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT')
-MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL')
-MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET')
-MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY')
-MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET')
-
-
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- get_dagster_logger().info(f"S3 URL : {str(os.environ.get('GLEANER_MINIO_URL'))}")
- get_dagster_logger().info(f"S3 PORT : {str(os.environ.get('GLEANER_MINIO_PORT'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {str(os.environ.get('GLEANER_MINIO_BUCKET'))}")
- get_dagster_logger().info(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
- try:
- data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- client = Minio(
- server,
- # secure=True,
- secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
-
- # Make 'X' bucket if it does not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname
- client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'),
- objPrefix,
- io.BytesIO(data),
- len(data))
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-
-
-def gleanerio(mode, source):
- ## ------------ Create
-
- get_dagster_logger().info(f"Create: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH')
- CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"]
- NAME = "gleaner01_" + source
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "nabu"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
- return 1
-
- data = {}
- data["Image"] = IMAGE
- data["Cmd"] = CMD
-
- # add env variables here, e.g. "Env": ["FOO=bar", "BAZ=quux"]
- enva = []
- enva.append(str("MINIO_URL={}".format(MINIO_URL)))
- enva.append(str("MINIO_PORT={}".format(MINIO_PORT)))
- enva.append(str("MINIO_SSL={}".format(MINIO_SSL)))
- enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET)))
- enva.append(str("MINIO_KEY={}".format(MINIO_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET)))
-
- data["Env"] = enva
-
- url = URL + 'containers/create'
- params = {
- "name": NAME
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- get_dagster_logger().info(f"URL: {str(url)}")
-
- req = request.Request(url, str.encode(json.dumps(data)))
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- c = r.read()
- d = json.loads(c)
- cid = d['Id']
-
- print(r.status)
- get_dagster_logger().info(f"Create: {str(r.status)}")
-
- # print(cid)
-
- ## ------------ Archive to load, which is how the config is sent in (from where?)
-
- url = URL + 'containers/' + cid + '/archive'
- params = {
- 'path': ARCHIVE_PATH
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
-
- # DATA = read_file_bytestream(ARCHIVE_FILE)
- DATA = s3reader(ARCHIVE_FILE)
-
- req = request.Request(url, data=DATA, method="PUT")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- print(r.status)
- get_dagster_logger().info(f"Archive: {str(r.status)}")
-
- # c = r.read()
- # print(c)
- # d = json.loads(c)
- # print(d)
-
- ## ------------ Start
-
- url = URL + 'containers/' + cid + '/start'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Start: {str(r.status)}")
-
- ## ------------ Wait expect 200
-
- url = URL + 'containers/' + cid + '/wait'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Wait: {str(r.status)}")
-
- ## ------------ Copy logs expect 200
-
- url = URL + 'containers/' + cid + '/logs'
- params = {
- 'stdout': 'true',
- 'stderr': 'false'
- }
- query_string = urllib.parse.urlencode(params)
-
- url = url + "?" + query_string
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- c = r.read()
-
- # write to file
- # f = open(LOGFILE, 'w')
- # f.write(str(c))
- # f.close()
-
- # write to s3
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes-like object
-
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"Logs: {str(r.status)}")
-
- ## ------------ Remove expect 204
-
- url = URL + 'containers/' + cid
- req = request.Request(url, method="DELETE")
- req.add_header('X-API-Key', APIKEY)
- # req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Remove: {str(r.status)}")
-
- return 0
-
-@op
-def ntu_gleaner(context):
- returned_value = gleanerio(("gleaner"), "ntu")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner notes are {r} ")
- return r
-
-@op
-def ntu_nabu(context, msg: str):
- returned_value = gleanerio(("nabu"), "ntu")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def ntu_nabuprov(context, msg: str):
- returned_value = gleanerio(("prov"), "ntu")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def ntu_nabuorg(context, msg: str):
- returned_value = gleanerio(("orgs"), "ntu")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def ntu_naburelease(context, msg: str):
- returned_value = gleanerio(("release"), "ntu")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@graph
-def harvest_ntu():
- harvest = ntu_gleaner()
- load1 = ntu_nabu(harvest)
- load2 = ntu_nabuprov(load1)
- load3 = ntu_nabuorg(load2)
- load4 = ntu_naburelease(load3)
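Every Portainer call in these modules is a bare `request.urlopen(req)`, so any 4xx/5xx response surfaces as an unhandled `urllib.error.HTTPError` traceback mid-op. A hedged sketch of a thin wrapper (the helper name is hypothetical) that logs the failing call before re-raising:

```python
# Hypothetical helper: centralize the header boilerplate and surface
# Portainer errors in the Dagster log instead of a bare traceback.
from urllib import error, request

from dagster import get_dagster_logger


def portainer_call(url, apikey, method="GET", data=None,
                   content_type="application/json"):
    req = request.Request(url, data=data, method=method)
    req.add_header("X-API-Key", apikey)
    req.add_header("content-type", content_type)
    req.add_header("accept", "application/json")
    try:
        with request.urlopen(req) as r:
            return r.status, r.read()
    except error.HTTPError as err:
        get_dagster_logger().error(f"{method} {url} -> {err.code}: {err.read()!r}")
        raise
```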
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_ofd.py b/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_ofd.py
deleted file mode 100644
index 6ad1294a..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_ofd.py
+++ /dev/null
@@ -1,304 +0,0 @@
-import distutils
-
-from dagster import op, graph, get_dagster_logger
-import subprocess
-import os, json, io
-import urllib
-from urllib import request
-from dagster import job, op, get_dagster_logger
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-
-# Vars and Envs
-
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-MINIO_URL = os.environ.get('GLEANER_MINIO_URL')
-MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT')
-MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL')
-MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET')
-MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY')
-MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET')
-
-
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- get_dagster_logger().info(f"S3 URL : {str(os.environ.get('GLEANER_MINIO_URL'))}")
- get_dagster_logger().info(f"S3 PORT : {str(os.environ.get('GLEANER_MINIO_PORT'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {str(os.environ.get('GLEANER_MINIO_BUCKET'))}")
- get_dagster_logger().info(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
- try:
- data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- client = Minio(
- server,
- # secure=True,
- secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
-
- # Make 'X' bucket if it does not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname
- client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'),
- objPrefix,
- io.BytesIO(data),
- len(data))
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-
-
-def gleanerio(mode, source):
- ## ------------ Create
-
- get_dagster_logger().info(f"Create: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH')
- CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"]
- NAME = "gleaner01_" + source
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "nabu"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
- return 1
-
- data = {}
- data["Image"] = IMAGE
- data["Cmd"] = CMD
-
- # add env variables here, e.g. "Env": ["FOO=bar", "BAZ=quux"]
- enva = []
- enva.append(str("MINIO_URL={}".format(MINIO_URL)))
- enva.append(str("MINIO_PORT={}".format(MINIO_PORT)))
- enva.append(str("MINIO_SSL={}".format(MINIO_SSL)))
- enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET)))
- enva.append(str("MINIO_KEY={}".format(MINIO_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET)))
-
- data["Env"] = enva
-
- url = URL + 'containers/create'
- params = {
- "name": NAME
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- get_dagster_logger().info(f"URL: {str(url)}")
-
- req = request.Request(url, str.encode(json.dumps(data)))
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- c = r.read()
- d = json.loads(c)
- cid = d['Id']
-
- print(r.status)
- get_dagster_logger().info(f"Create: {str(r.status)}")
-
- # print(cid)
-
- ## ------------ Archive to load, which is how the config is sent in (from where?)
-
- url = URL + 'containers/' + cid + '/archive'
- params = {
- 'path': ARCHIVE_PATH
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
-
- # DATA = read_file_bytestream(ARCHIVE_FILE)
- DATA = s3reader(ARCHIVE_FILE)
-
- req = request.Request(url, data=DATA, method="PUT")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- print(r.status)
- get_dagster_logger().info(f"Archive: {str(r.status)}")
-
- # c = r.read()
- # print(c)
- # d = json.loads(c)
- # print(d)
-
- ## ------------ Start
-
- url = URL + 'containers/' + cid + '/start'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Start: {str(r.status)}")
-
- ## ------------ Wait expect 200
-
- url = URL + 'containers/' + cid + '/wait'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Wait: {str(r.status)}")
-
- ## ------------ Copy logs expect 200
-
- url = URL + 'containers/' + cid + '/logs'
- params = {
- 'stdout': 'true',
- 'stderr': 'false'
- }
- query_string = urllib.parse.urlencode(params)
-
- url = url + "?" + query_string
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- c = r.read()
-
- # write to file
- # f = open(LOGFILE, 'w')
- # f.write(str(c))
- # f.close()
-
- # write to s3
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes-like object
-
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"Logs: {str(r.status)}")
-
- ## ------------ Remove expect 204
-
- url = URL + 'containers/' + cid
- req = request.Request(url, method="DELETE")
- req.add_header('X-API-Key', APIKEY)
- # req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Remove: {str(r.status)}")
-
- return 0
-
-@op
-def ofd_gleaner(context):
- returned_value = gleanerio(("gleaner"), "ofd")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner notes are {r} ")
- return r
-
-@op
-def ofd_nabu(context, msg: str):
- returned_value = gleanerio(("nabu"), "ofd")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def ofd_nabuprov(context, msg: str):
- returned_value = gleanerio(("prov"), "ofd")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def ofd_nabuorg(context, msg: str):
- returned_value = gleanerio(("orgs"), "ofd")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def ofd_naburelease(context, msg: str):
- returned_value = gleanerio(("release"), "ofd")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@graph
-def harvest_ofd():
- harvest = ofd_gleaner()
- load1 = ofd_nabu(harvest)
- load2 = ofd_nabuprov(load1)
- load3 = ofd_nabuorg(load2)
- load4 = ofd_naburelease(load3)
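The `secure=bool(distutils.util.strtobool(...))` calls repeated above pin these modules to `distutils`, which PEP 632 deprecated in Python 3.10 and removed in 3.12. A small local parser, sketched here as a stand-in, keeps the MinIO `secure` flag working on newer interpreters:

```python
# Hypothetical stand-in for distutils.util.strtobool (removed in Python 3.12).
def str_to_bool(value: str) -> bool:
    v = value.strip().lower()
    if v in ("y", "yes", "t", "true", "on", "1"):
        return True
    if v in ("n", "no", "f", "false", "off", "0"):
        return False
    raise ValueError(f"invalid truth value {value!r}")


# e.g. secure=str_to_bool(os.environ.get("GLEANER_MINIO_SSL", "false"))
```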
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_peking.py b/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_peking.py
deleted file mode 100644
index e32c7fd8..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_peking.py
+++ /dev/null
@@ -1,304 +0,0 @@
-import distutils
-
-from dagster import op, graph, get_dagster_logger
-import subprocess
-import os, json, io
-import urllib
-from urllib import request
-from dagster import job, op, get_dagster_logger
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-
-# Vars and Envs
-
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-MINIO_URL = os.environ.get('GLEANER_MINIO_URL')
-MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT')
-MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL')
-MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET')
-MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY')
-MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET')
-
-
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- get_dagster_logger().info(f"S3 URL : {str(os.environ.get('GLEANER_MINIO_URL'))}")
- get_dagster_logger().info(f"S3 PORT : {str(os.environ.get('GLEANER_MINIO_PORT'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {str(os.environ.get('GLEANER_MINIO_BUCKET'))}")
- get_dagster_logger().info(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
- try:
- data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- client = Minio(
- server,
- # secure=True,
- secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
-
- # Make 'X' bucket if it does not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname
- client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'),
- objPrefix,
- io.BytesIO(data),
- len(data))
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-
-
-def gleanerio(mode, source):
- ## ------------ Create
-
- get_dagster_logger().info(f"Create: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH')
- CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"]
- NAME = "gleaner01_" + source
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "nabu"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
- return 1
-
- data = {}
- data["Image"] = IMAGE
- data["Cmd"] = CMD
-
- # add env variables here, e.g. "Env": ["FOO=bar", "BAZ=quux"]
- enva = []
- enva.append(str("MINIO_URL={}".format(MINIO_URL)))
- enva.append(str("MINIO_PORT={}".format(MINIO_PORT)))
- enva.append(str("MINIO_SSL={}".format(MINIO_SSL)))
- enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET)))
- enva.append(str("MINIO_KEY={}".format(MINIO_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET)))
-
- data["Env"] = enva
-
- url = URL + 'containers/create'
- params = {
- "name": NAME
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- get_dagster_logger().info(f"URL: {str(url)}")
-
- req = request.Request(url, str.encode(json.dumps(data)))
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- c = r.read()
- d = json.loads(c)
- cid = d['Id']
-
- print(r.status)
- get_dagster_logger().info(f"Create: {str(r.status)}")
-
- # print(cid)
-
- ## ------------ Archive to load, which is how the config is sent in (from where?)
-
- url = URL + 'containers/' + cid + '/archive'
- params = {
- 'path': ARCHIVE_PATH
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
-
- # DATA = read_file_bytestream(ARCHIVE_FILE)
- DATA = s3reader(ARCHIVE_FILE)
-
- req = request.Request(url, data=DATA, method="PUT")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- print(r.status)
- get_dagster_logger().info(f"Archive: {str(r.status)}")
-
- # c = r.read()
- # print(c)
- # d = json.loads(c)
- # print(d)
-
- ## ------------ Start
-
- url = URL + 'containers/' + cid + '/start'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Start: {str(r.status)}")
-
- ## ------------ Wait expect 200
-
- url = URL + 'containers/' + cid + '/wait'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Wait: {str(r.status)}")
-
- ## ------------ Copy logs expect 200
-
- url = URL + 'containers/' + cid + '/logs'
- params = {
- 'stdout': 'true',
- 'stderr': 'false'
- }
- query_string = urllib.parse.urlencode(params)
-
- url = url + "?" + query_string
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- c = r.read()
-
- # write to file
- # f = open(LOGFILE, 'w')
- # f.write(str(c))
- # f.close()
-
- # write to s3
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes-like object
-
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"Logs: {str(r.status)}")
-
- ## ------------ Remove expect 204
-
- url = URL + 'containers/' + cid
- req = request.Request(url, method="DELETE")
- req.add_header('X-API-Key', APIKEY)
- # req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Remove: {str(r.status)}")
-
- return 0
-
-@op
-def peking_gleaner(context):
- returned_value = gleanerio(("gleaner"), "peking")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner notes are {r} ")
- return r
-
-@op
-def peking_nabu(context, msg: str):
- returned_value = gleanerio(("nabu"), "peking")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def peking_nabuprov(context, msg: str):
- returned_value = gleanerio(("prov"), "peking")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def peking_nabuorg(context, msg: str):
- returned_value = gleanerio(("orgs"), "peking")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def peking_naburelease(context, msg: str):
- returned_value = gleanerio(("release"), "peking")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@graph
-def harvest_peking():
- harvest = peking_gleaner()
- load1 = peking_nabu(harvest)
- load2 = peking_nabuprov(load1)
- load3 = peking_nabuorg(load2)
- load4 = peking_naburelease(load3)
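`s3reader` above hands the raw response from MinIO's `get_object` straight to urllib as a PUT body; minio-py expects callers to read, close, and release that response. A hedged variant (function name hypothetical) that returns plain bytes and always releases the connection:

```python
# Hypothetical variant of s3reader that returns bytes and cleans up.
from minio import Minio
from minio.error import S3Error


def s3read_bytes(client: Minio, bucket: str, name: str) -> bytes:
    response = None
    try:
        response = client.get_object(bucket, name)
        return response.read()
    except S3Error as err:
        raise RuntimeError(f"S3 read failed for {name}") from err
    finally:
        if response is not None:
            response.close()
            response.release_conn()
```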
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_pesquisa.py b/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_pesquisa.py
deleted file mode 100644
index be22fa7e..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_pesquisa.py
+++ /dev/null
@@ -1,304 +0,0 @@
-import distutils
-
-from dagster import op, graph, get_dagster_logger
-import subprocess
-import os, json, io
-import urllib
-from urllib import request
-from dagster import job, op, get_dagster_logger
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-
-# Vars and Envs
-
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-MINIO_URL = os.environ.get('GLEANER_MINIO_URL')
-MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT')
-MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL')
-MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET')
-MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY')
-MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET')
-
-
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- get_dagster_logger().info(f"S3 URL : {str(os.environ.get('GLEANER_MINIO_URL'))}")
- get_dagster_logger().info(f"S3 PORT : {str(os.environ.get('GLEANER_MINIO_PORT'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {str(os.environ.get('GLEANER_MINIO_BUCKET'))}")
- get_dagster_logger().info(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
- try:
- data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- client = Minio(
- server,
- # secure=True,
- secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
-
- # Make 'X' bucket if it does not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname
- client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'),
- objPrefix,
- io.BytesIO(data),
- len(data))
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-
-
-def gleanerio(mode, source):
- ## ------------ Create
-
- get_dagster_logger().info(f"Create: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH')
- CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"]
- NAME = "gleaner01_" + source
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "nabu"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
- return 1
-
- data = {}
- data["Image"] = IMAGE
- data["Cmd"] = CMD
-
- # add env variables here, e.g. "Env": ["FOO=bar", "BAZ=quux"]
- enva = []
- enva.append(str("MINIO_URL={}".format(MINIO_URL)))
- enva.append(str("MINIO_PORT={}".format(MINIO_PORT)))
- enva.append(str("MINIO_SSL={}".format(MINIO_SSL)))
- enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET)))
- enva.append(str("MINIO_KEY={}".format(MINIO_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET)))
-
- data["Env"] = enva
-
- url = URL + 'containers/create'
- params = {
- "name": NAME
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- get_dagster_logger().info(f"URL: {str(url)}")
-
- req = request.Request(url, str.encode(json.dumps(data)))
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- c = r.read()
- d = json.loads(c)
- cid = d['Id']
-
- print(r.status)
- get_dagster_logger().info(f"Create: {str(r.status)}")
-
- # print(cid)
-
- ## ------------ Archive to load, which is how the config is sent in (from where?)
-
- url = URL + 'containers/' + cid + '/archive'
- params = {
- 'path': ARCHIVE_PATH
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
-
- # DATA = read_file_bytestream(ARCHIVE_FILE)
- DATA = s3reader(ARCHIVE_FILE)
-
- req = request.Request(url, data=DATA, method="PUT")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- print(r.status)
- get_dagster_logger().info(f"Archive: {str(r.status)}")
-
- # c = r.read()
- # print(c)
- # d = json.loads(c)
- # print(d)
-
- ## ------------ Start
-
- url = URL + 'containers/' + cid + '/start'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Start: {str(r.status)}")
-
- ## ------------ Wait expect 200
-
- url = URL + 'containers/' + cid + '/wait'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Wait: {str(r.status)}")
-
- ## ------------ Copy logs expect 200
-
- url = URL + 'containers/' + cid + '/logs'
- params = {
- 'stdout': 'true',
- 'stderr': 'false'
- }
- query_string = urllib.parse.urlencode(params)
-
- url = url + "?" + query_string
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- c = r.read()
-
- # write to file
- # f = open(LOGFILE, 'w')
- # f.write(str(c))
- # f.close()
-
- # write to s3
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes-like object
-
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"Logs: {str(r.status)}")
-
- ## ------------ Remove expect 204
-
- url = URL + 'containers/' + cid
- req = request.Request(url, method="DELETE")
- req.add_header('X-API-Key', APIKEY)
- # req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Remove: {str(r.status)}")
-
- return 0
-
-@op
-def pesquisa_gleaner(context):
- returned_value = gleanerio(("gleaner"), "pesquisa")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner notes are {r} ")
- return r
-
-@op
-def pesquisa_nabu(context, msg: str):
- returned_value = gleanerio(("nabu"), "pesquisa")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def pesquisa_nabuprov(context, msg: str):
- returned_value = gleanerio(("prov"), "pesquisa")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def pesquisa_nabuorg(context, msg: str):
- returned_value = gleanerio(("orgs"), "pesquisa")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def pesquisa_naburelease(context, msg: str):
- returned_value = gleanerio(("release"), "pesquisa")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@graph
-def harvest_pesquisa():
- harvest = pesquisa_gleaner()
- load1 = pesquisa_nabu(harvest)
- load2 = pesquisa_nabuprov(load1)
- load3 = pesquisa_nabuorg(load2)
- load4 = pesquisa_naburelease(load3)
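The `/containers/{cid}/wait` call above is checked only for its HTTP status, which is 200 even when the container exits non-zero; the Docker Engine API that Portainer proxies reports the exit code in the JSON body. A sketch of the missing check (helper name hypothetical):

```python
# Hypothetical check on the /wait response body.
import json


def check_wait_response(body: bytes) -> int:
    payload = json.loads(body)
    status_code = int(payload.get("StatusCode", -1))
    if status_code != 0:
        raise RuntimeError(f"container exited with status {status_code}")
    return status_code
```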
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_pucdp.py b/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_pucdp.py
deleted file mode 100644
index 4ccc7107..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_pucdp.py
+++ /dev/null
@@ -1,304 +0,0 @@
-import distutils
-
-from dagster import op, graph, get_dagster_logger
-import subprocess
-import os, json, io
-import urllib
-from urllib import request
-from dagster import job, op, get_dagster_logger
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-
-# Vars and Envs
-
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-MINIO_URL = os.environ.get('GLEANER_MINIO_URL')
-MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT')
-MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL')
-MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET')
-MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY')
-MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET')
-
-
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- get_dagster_logger().info(f"S3 URL : {str(os.environ.get('GLEANER_MINIO_URL'))}")
- get_dagster_logger().info(f"S3 PORT : {str(os.environ.get('GLEANER_MINIO_PORT'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {str(os.environ.get('GLEANER_MINIO_BUCKET'))}")
- get_dagster_logger().info(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
- try:
- data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- client = Minio(
- server,
- # secure=True,
- secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
-
- # Make 'X' bucket if it does not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname
- client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'),
- objPrefix,
- io.BytesIO(data),
- len(data))
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-
-
-def gleanerio(mode, source):
- ## ------------ Create
-
- get_dagster_logger().info(f"Create: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH')
- CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"]
- NAME = "gleaner01_" + source
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "nabu"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
- return 1
-
- data = {}
- data["Image"] = IMAGE
- data["Cmd"] = CMD
-
- # add env variables here, e.g. "Env": ["FOO=bar", "BAZ=quux"]
- enva = []
- enva.append(str("MINIO_URL={}".format(MINIO_URL)))
- enva.append(str("MINIO_PORT={}".format(MINIO_PORT)))
- enva.append(str("MINIO_SSL={}".format(MINIO_SSL)))
- enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET)))
- enva.append(str("MINIO_KEY={}".format(MINIO_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET)))
-
- data["Env"] = enva
-
- url = URL + 'containers/create'
- params = {
- "name": NAME
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- get_dagster_logger().info(f"URL: {str(url)}")
-
- req = request.Request(url, str.encode(json.dumps(data)))
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- c = r.read()
- d = json.loads(c)
- cid = d['Id']
-
- print(r.status)
- get_dagster_logger().info(f"Create: {str(r.status)}")
-
- # print(cid)
-
- ## ------------ Archive to load, which is how the config is sent in (from where?)
-
- url = URL + 'containers/' + cid + '/archive'
- params = {
- 'path': ARCHIVE_PATH
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
-
- # DATA = read_file_bytestream(ARCHIVE_FILE)
- DATA = s3reader(ARCHIVE_FILE)
-
- req = request.Request(url, data=DATA, method="PUT")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- print(r.status)
- get_dagster_logger().info(f"Archive: {str(r.status)}")
-
- # c = r.read()
- # print(c)
- # d = json.loads(c)
- # print(d)
-
- ## ------------ Start
-
- url = URL + 'containers/' + cid + '/start'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Start: {str(r.status)}")
-
- ## ------------ Wait expect 200
-
- url = URL + 'containers/' + cid + '/wait'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Wait: {str(r.status)}")
-
- ## ------------ Copy logs expect 200
-
- url = URL + 'containers/' + cid + '/logs'
- params = {
- 'stdout': 'true',
- 'stderr': 'false'
- }
- query_string = urllib.parse.urlencode(params)
-
- url = url + "?" + query_string
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- c = r.read()
-
- # write to file
- # f = open(LOGFILE, 'w')
- # f.write(str(c))
- # f.close()
-
- # write to s3
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes-like object
-
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"Logs: {str(r.status)}")
-
- ## ------------ Remove expect 204
-
- url = URL + 'containers/' + cid
- req = request.Request(url, method="DELETE")
- req.add_header('X-API-Key', APIKEY)
- # req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Remove: {str(r.status)}")
-
- return 0
-
-@op
-def pucdp_gleaner(context):
- returned_value = gleanerio(("gleaner"), "pucdp")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner notes are {r} ")
- return r
-
-@op
-def pucdp_nabu(context, msg: str):
- returned_value = gleanerio(("nabu"), "pucdp")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def pucdp_nabuprov(context, msg: str):
- returned_value = gleanerio(("prov"), "pucdp")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def pucdp_nabuorg(context, msg: str):
- returned_value = gleanerio(("orgs"), "pucdp")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def pucdp_naburelease(context, msg: str):
- returned_value = gleanerio(("release"), "pucdp")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@graph
-def harvest_pucdp():
- harvest = pucdp_gleaner()
- load1 = pucdp_nabu(harvest)
- load2 = pucdp_nabuprov(load1)
- load3 = pucdp_nabuorg(load2)
- load4 = pucdp_naburelease(load3)
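Two quirks in the log-copy step above: `str(c).encode()` uploads the Python repr of the bytes (`b'...'` with literal `\n` escapes) rather than the log text, and containers run without a TTY get their stdout framed by the Docker logs endpoint in 8-byte headers. A hedged demultiplexer sketch, assuming non-TTY containers:

```python
# Hypothetical demultiplexer for Docker's non-TTY log stream:
# each frame is 1 byte stream type, 3 zero bytes, 4-byte big-endian length.
import struct


def demux_docker_logs(raw: bytes) -> bytes:
    out, i = bytearray(), 0
    while i + 8 <= len(raw):
        _, size = struct.unpack(">BxxxL", raw[i:i + 8])
        out += raw[i + 8:i + 8 + size]
        i += 8 + size
    return bytes(out)


# e.g. s3loader(demux_docker_logs(c), NAME)
```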
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_qdr.py b/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_qdr.py
deleted file mode 100644
index 09aaa3cb..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_qdr.py
+++ /dev/null
@@ -1,304 +0,0 @@
-import distutils
-
-from dagster import op, graph, get_dagster_logger
-import subprocess
-import os, json, io
-import urllib
-from urllib import request
-from dagster import job, op, get_dagster_logger
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-
-# Vars and Envs
-
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-MINIO_URL = os.environ.get('GLEANER_MINIO_URL')
-MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT')
-MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL')
-MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET')
-MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY')
-MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET')
-
-
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- get_dagster_logger().info(f"S3 URL : {str(os.environ.get('GLEANER_MINIO_URL'))}")
- get_dagster_logger().info(f"S3 PORT : {str(os.environ.get('GLEANER_MINIO_PORT'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {str(os.environ.get('GLEANER_MINIO_BUCKET'))}")
- get_dagster_logger().info(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
- try:
- data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- client = Minio(
- server,
- # secure=True,
- secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
-
- # Make 'X' bucket if it does not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname
- client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'),
- objPrefix,
- io.BytesIO(data),
- len(data))
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-
-
-def gleanerio(mode, source):
- ## ------------ Create
-
- get_dagster_logger().info(f"Create: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH')
- CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"]
- NAME = "gleaner01_" + source
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "nabu"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
- return 1
-
- data = {}
- data["Image"] = IMAGE
- data["Cmd"] = CMD
-
- # add env variables here, e.g. "Env": ["FOO=bar", "BAZ=quux"]
- enva = []
- enva.append(str("MINIO_URL={}".format(MINIO_URL)))
- enva.append(str("MINIO_PORT={}".format(MINIO_PORT)))
- enva.append(str("MINIO_SSL={}".format(MINIO_SSL)))
- enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET)))
- enva.append(str("MINIO_KEY={}".format(MINIO_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET)))
-
- data["Env"] = enva
-
- url = URL + 'containers/create'
- params = {
- "name": NAME
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- get_dagster_logger().info(f"URL: {str(url)}")
-
- req = request.Request(url, str.encode(json.dumps(data)))
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- c = r.read()
- d = json.loads(c)
- cid = d['Id']
-
- print(r.status)
- get_dagster_logger().info(f"Create: {str(r.status)}")
-
- # print(cid)
-
- ## ------------ Archive to load, which is how the config is sent in (from where?)
-
- url = URL + 'containers/' + cid + '/archive'
- params = {
- 'path': ARCHIVE_PATH
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
-
- # DATA = read_file_bytestream(ARCHIVE_FILE)
- DATA = s3reader(ARCHIVE_FILE)
-
- req = request.Request(url, data=DATA, method="PUT")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- print(r.status)
- get_dagster_logger().info(f"Archive: {str(r.status)}")
-
- # c = r.read()
- # print(c)
- # d = json.loads(c)
- # print(d)
-
- ## ------------ Start
-
- url = URL + 'containers/' + cid + '/start'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Start: {str(r.status)}")
-
- ## ------------ Wait expect 200
-
- url = URL + 'containers/' + cid + '/wait'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Wait: {str(r.status)}")
-
- ## ------------ Copy logs expect 200
-
- url = URL + 'containers/' + cid + '/logs'
- params = {
- 'stdout': 'true',
- 'stderr': 'false'
- }
- query_string = urllib.parse.urlencode(params)
-
- url = url + "?" + query_string
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- c = r.read()
-
- # write to file
- # f = open(LOGFILE, 'w')
- # f.write(str(c))
- # f.close()
-
- # write to s3
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
-
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"Logs: {str(r.status)}")
-
- ## ------------ Remove expect 204
-
- url = URL + 'containers/' + cid
- req = request.Request(url, method="DELETE")
- req.add_header('X-API-Key', APIKEY)
- # req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Remove: {str(r.status)}")
-
- return 0
-
-@op
-def qdr_gleaner(context):
- returned_value = gleanerio(("gleaner"), "qdr")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner notes are {r} ")
- return r
-
-@op
-def qdr_nabu(context, msg: str):
- returned_value = gleanerio(("nabu"), "qdr")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def qdr_nabuprov(context, msg: str):
- returned_value = gleanerio(("prov"), "qdr")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def qdr_nabuorg(context, msg: str):
- returned_value = gleanerio(("orgs"), "qdr")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def qdr_naburelease(context, msg: str):
- returned_value = gleanerio(("release"), "qdr")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@graph
-def harvest_qdr():
- harvest = qdr_gleaner()
- load1 = qdr_nabu(harvest)
- load2 = qdr_nabuprov(load1)
- load3 = qdr_nabuorg(load2)
- load4 = qdr_naburelease(load3)
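
The per-source modules deleted in this commit (qdr above, rin, rosario, rsu, and the rest below) are identical except for the hard-coded source name, so each carries its own copy of `gleanerio` plus the same five-op chain. A minimal sketch of collapsing that into one factory, assuming `gleanerio(mode, source)` is hoisted into a shared module (the `workflows.gleanerio` import path and `make_harvest_graph` name below are hypothetical, not existing code):

```python
from dagster import graph, op, get_dagster_logger

# hypothetical shared home for the helper duplicated in every file above
from workflows.gleanerio import gleanerio

def make_harvest_graph(source: str):
    """Build the gleaner -> nabu -> prov -> orgs -> release chain for one source."""

    @op(name=f"{source}_gleaner")
    def gleaner_op() -> str:
        rv = gleanerio("gleaner", source)
        msg = f"returned value:{rv}"
        get_dagster_logger().info(f"Gleaner notes are {msg}")
        return msg

    def nabu_step(mode: str):
        # each call mints a uniquely named op for one nabu mode
        @op(name=f"{source}_nabu_{mode}")
        def step(msg: str) -> str:
            return msg + f"returned value:{gleanerio(mode, source)}"
        return step

    @graph(name=f"harvest_{source}")
    def harvest():
        out = gleaner_op()
        for mode in ("nabu", "prov", "orgs", "release"):
            out = nabu_step(mode)(out)

    return harvest

# e.g. make_harvest_graph("qdr").to_job() would stand in for the module above
```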
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_rin.py b/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_rin.py
deleted file mode 100644
index cf50affb..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_rin.py
+++ /dev/null
@@ -1,304 +0,0 @@
-import distutils
-
-from dagster import op, graph, get_dagster_logger
-import subprocess
-import os, json, io
-import urllib
-from urllib import request
-from dagster import job, op, get_dagster_logger
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-
-# Vars and Envs
-
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-MINIO_URL = os.environ.get('GLEANER_MINIO_URL')
-MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT')
-MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL')
-MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET')
-MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY')
-MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET')
-
-
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- get_dagster_logger().info(f"S3 URL : {str(os.environ.get('GLEANER_MINIO_URL'))}")
- get_dagster_logger().info(f"S3 PORT : {str(os.environ.get('GLEANER_MINIO_PORT'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {str(os.environ.get('GLEANER_MINIO_BUCKET'))}")
- get_dagster_logger().info(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
- try:
- data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- client = Minio(
- server,
- # secure=True,
- secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname
- client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'),
- objPrefix,
- io.BytesIO(data),
- len(data))
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-
-
-def gleanerio(mode, source):
- ## ------------ Create
-
- get_dagster_logger().info(f"Create: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH')
- CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"]
- NAME = "gleaner01_" + source
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "nabu"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
- return 1
-
- data = {}
- data["Image"] = IMAGE
- data["Cmd"] = CMD
-
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
- enva = []
- enva.append(str("MINIO_URL={}".format(MINIO_URL)))
- enva.append(str("MINIO_PORT={}".format(MINIO_PORT)))
- enva.append(str("MINIO_SSL={}".format(MINIO_SSL)))
- enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET)))
- enva.append(str("MINIO_KEY={}".format(MINIO_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET)))
-
- data["Env"] = enva
-
- url = URL + 'containers/create'
- params = {
- "name": NAME
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- get_dagster_logger().info(f"URL: {str(url)}")
-
- req = request.Request(url, str.encode(json.dumps(data)))
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- c = r.read()
- d = json.loads(c)
- cid = d['Id']
-
- print(r.status)
- get_dagster_logger().info(f"Create: {str(r.status)}")
-
- # print(cid)
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
- url = URL + 'containers/' + cid + '/archive'
- params = {
- 'path': ARCHIVE_PATH
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
-
- # DATA = read_file_bytestream(ARCHIVE_FILE)
- DATA = s3reader(ARCHIVE_FILE)
-
- req = request.Request(url, data=DATA, method="PUT")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- print(r.status)
- get_dagster_logger().info(f"Archive: {str(r.status)}")
-
- # c = r.read()
- # print(c)
- # d = json.loads(c)
- # print(d)
-
- ## ------------ Start
-
- url = URL + 'containers/' + cid + '/start'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Start: {str(r.status)}")
-
- ## ------------ Wait expect 200
-
- url = URL + 'containers/' + cid + '/wait'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Wait: {str(r.status)}")
-
- ## ------------ Copy logs expect 200
-
- url = URL + 'containers/' + cid + '/logs'
- params = {
- 'stdout': 'true',
- 'stderr': 'false'
- }
- query_string = urllib.parse.urlencode(params)
-
- url = url + "?" + query_string
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- c = r.read()
-
- # write to file
- # f = open(LOGFILE, 'w')
- # f.write(str(c))
- # f.close()
-
- # write to s3
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
-
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"Logs: {str(r.status)}")
-
- ## ------------ Remove expect 204
-
- url = URL + 'containers/' + cid
- req = request.Request(url, method="DELETE")
- req.add_header('X-API-Key', APIKEY)
- # req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Remove: {str(r.status)}")
-
- return 0
-
-@op
-def rin_gleaner(context):
- returned_value = gleanerio(("gleaner"), "rin")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner notes are {r} ")
- return r
-
-@op
-def rin_nabu(context, msg: str):
- returned_value = gleanerio(("nabu"), "rin")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def rin_nabuprov(context, msg: str):
- returned_value = gleanerio(("prov"), "rin")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def rin_nabuorg(context, msg: str):
- returned_value = gleanerio(("orgs"), "rin")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def rin_naburelease(context, msg: str):
- returned_value = gleanerio(("release"), "rin")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@graph
-def harvest_rin():
- harvest = rin_gleaner()
- load1 = rin_nabu(harvest)
- load2 = rin_nabuprov(load1)
- load3 = rin_nabuorg(load2)
- load4 = rin_naburelease(load3)
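
Every one of these deleted modules drives the same Portainer container lifecycle — create, upload the config archive, start, wait, fetch logs, remove — by hand-building a urllib request and repeating the same three or four `add_header` calls at each step. A sketch of the shared round trip that pattern implies (`portainer_call` is a name invented here for illustration, not an existing helper):

```python
import json
import urllib.parse
from typing import Optional
from urllib import request

def portainer_call(base_url: str, api_key: str, path: str, *,
                   method: str = "GET",
                   params: Optional[dict] = None,
                   body: Optional[bytes] = None,
                   content_type: str = "application/json") -> bytes:
    """One HTTP round trip against the Portainer endpoint, headers handled once."""
    url = base_url + path
    if params:
        url = url + "?" + urllib.parse.urlencode(params)
    req = request.Request(url, data=body, method=method)
    req.add_header("X-API-Key", api_key)
    req.add_header("accept", "application/json")
    if body is not None:
        req.add_header("content-type", content_type)
    with request.urlopen(req) as resp:
        return resp.read()

# The create step above would then read:
#   raw = portainer_call(URL, APIKEY, "containers/create", method="POST",
#                        params={"name": NAME},
#                        body=json.dumps(data).encode())
#   cid = json.loads(raw)["Id"]
```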
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_rosario.py b/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_rosario.py
deleted file mode 100644
index 8a9f398e..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_rosario.py
+++ /dev/null
@@ -1,304 +0,0 @@
-import distutils
-
-from dagster import op, graph, get_dagster_logger
-import subprocess
-import os, json, io
-import urllib
-from urllib import request
-from dagster import job, op, get_dagster_logger
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-
-# Vars and Envs
-
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-MINIO_URL = os.environ.get('GLEANER_MINIO_URL')
-MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT')
-MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL')
-MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET')
-MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY')
-MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET')
-
-
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- get_dagster_logger().info(f"S3 URL : {str(os.environ.get('GLEANER_MINIO_URL'))}")
- get_dagster_logger().info(f"S3 PORT : {str(os.environ.get('GLEANER_MINIO_PORT'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {str(os.environ.get('GLEANER_MINIO_BUCKET'))}")
- get_dagster_logger().info(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
- try:
- data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- client = Minio(
- server,
- # secure=True,
- secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname
- client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'),
- objPrefix,
- io.BytesIO(data),
- len(data))
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-
-
-def gleanerio(mode, source):
- ## ------------ Create
-
- get_dagster_logger().info(f"Create: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH')
- CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"]
- NAME = "gleaner01_" + source
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "nabu"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
- return 1
-
- data = {}
- data["Image"] = IMAGE
- data["Cmd"] = CMD
-
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
- enva = []
- enva.append(str("MINIO_URL={}".format(MINIO_URL)))
- enva.append(str("MINIO_PORT={}".format(MINIO_PORT)))
- enva.append(str("MINIO_SSL={}".format(MINIO_SSL)))
- enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET)))
- enva.append(str("MINIO_KEY={}".format(MINIO_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET)))
-
- data["Env"] = enva
-
- url = URL + 'containers/create'
- params = {
- "name": NAME
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- get_dagster_logger().info(f"URL: {str(url)}")
-
- req = request.Request(url, str.encode(json.dumps(data)))
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- c = r.read()
- d = json.loads(c)
- cid = d['Id']
-
- print(r.status)
- get_dagster_logger().info(f"Create: {str(r.status)}")
-
- # print(cid)
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
- url = URL + 'containers/' + cid + '/archive'
- params = {
- 'path': ARCHIVE_PATH
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
-
- # DATA = read_file_bytestream(ARCHIVE_FILE)
- DATA = s3reader(ARCHIVE_FILE)
-
- req = request.Request(url, data=DATA, method="PUT")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- print(r.status)
- get_dagster_logger().info(f"Archive: {str(r.status)}")
-
- # c = r.read()
- # print(c)
- # d = json.loads(c)
- # print(d)
-
- ## ------------ Start
-
- url = URL + 'containers/' + cid + '/start'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Start: {str(r.status)}")
-
- ## ------------ Wait expect 200
-
- url = URL + 'containers/' + cid + '/wait'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Wait: {str(r.status)}")
-
- ## ------------ Copy logs expect 200
-
- url = URL + 'containers/' + cid + '/logs'
- params = {
- 'stdout': 'true',
- 'stderr': 'false'
- }
- query_string = urllib.parse.urlencode(params)
-
- url = url + "?" + query_string
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- c = r.read()
-
- # write to file
- # f = open(LOGFILE, 'w')
- # f.write(str(c))
- # f.close()
-
- # write to s3
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
-
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"Logs: {str(r.status)}")
-
- ## ------------ Remove expect 204
-
- url = URL + 'containers/' + cid
- req = request.Request(url, method="DELETE")
- req.add_header('X-API-Key', APIKEY)
- # req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Remove: {str(r.status)}")
-
- return 0
-
-@op
-def rosario_gleaner(context):
- returned_value = gleanerio(("gleaner"), "rosario")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner notes are {r} ")
- return r
-
-@op
-def rosario_nabu(context, msg: str):
- returned_value = gleanerio(("nabu"), "rosario")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def rosario_nabuprov(context, msg: str):
- returned_value = gleanerio(("prov"), "rosario")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def rosario_nabuorg(context, msg: str):
- returned_value = gleanerio(("orgs"), "rosario")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def rosario_naburelease(context, msg: str):
- returned_value = gleanerio(("release"), "rosario")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@graph
-def harvest_rosario():
- harvest = rosario_gleaner()
- load1 = rosario_nabu(harvest)
- load2 = rosario_nabuprov(load1)
- load3 = rosario_nabuorg(load2)
- load4 = rosario_naburelease(load3)
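
One latent bug worth flagging in the files being removed: they do a bare `import distutils` and then call `distutils.util.strtobool`, but `import distutils` does not load the `util` submodule (it only works if something else imported it first), and distutils itself is removed in Python 3.12. A small stand-in that keeps the same truthy/falsy vocabulary as `strtobool` (`env_flag` is a hypothetical name, not part of the codebase):

```python
import os

_TRUTHY = {"y", "yes", "t", "true", "on", "1"}
_FALSY = {"n", "no", "f", "false", "off", "0"}

def env_flag(name: str, default: bool = False) -> bool:
    """Parse a boolean environment variable the way strtobool did."""
    raw = os.environ.get(name)
    if raw is None:
        return default
    value = raw.strip().lower()
    if value in _TRUTHY:
        return True
    if value in _FALSY:
        return False
    raise ValueError(f"invalid boolean for {name!r}: {raw!r}")

# e.g. secure=env_flag("GLEANER_MINIO_SSL") when constructing the Minio client
```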
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_rsu.py b/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_rsu.py
deleted file mode 100644
index b3f9693d..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_rsu.py
+++ /dev/null
@@ -1,304 +0,0 @@
-import distutils
-
-from dagster import op, graph, get_dagster_logger
-import subprocess
-import os, json, io
-import urllib
-from urllib import request
-from dagster import job, op, get_dagster_logger
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-
-# Vars and Envs
-
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-MINIO_URL = os.environ.get('GLEANER_MINIO_URL')
-MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT')
-MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL')
-MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET')
-MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY')
-MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET')
-
-
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- get_dagster_logger().info(f"S3 URL : {str(os.environ.get('GLEANER_MINIO_URL'))}")
- get_dagster_logger().info(f"S3 PORT : {str(os.environ.get('GLEANER_MINIO_PORT'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {str(os.environ.get('GLEANER_MINIO_BUCKET'))}")
- get_dagster_logger().info(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
- try:
- data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- client = Minio(
- server,
- # secure=True,
- secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname
- client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'),
- objPrefix,
- io.BytesIO(data),
- len(data))
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-
-
-def gleanerio(mode, source):
- ## ------------ Create
-
- get_dagster_logger().info(f"Create: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH')
- CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"]
- NAME = "gleaner01_" + source
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "nabu"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
- return 1
-
- data = {}
- data["Image"] = IMAGE
- data["Cmd"] = CMD
-
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
- enva = []
- enva.append(str("MINIO_URL={}".format(MINIO_URL)))
- enva.append(str("MINIO_PORT={}".format(MINIO_PORT)))
- enva.append(str("MINIO_SSL={}".format(MINIO_SSL)))
- enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET)))
- enva.append(str("MINIO_KEY={}".format(MINIO_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET)))
-
- data["Env"] = enva
-
- url = URL + 'containers/create'
- params = {
- "name": NAME
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- get_dagster_logger().info(f"URL: {str(url)}")
-
- req = request.Request(url, str.encode(json.dumps(data)))
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- c = r.read()
- d = json.loads(c)
- cid = d['Id']
-
- print(r.status)
- get_dagster_logger().info(f"Create: {str(r.status)}")
-
- # print(cid)
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
- url = URL + 'containers/' + cid + '/archive'
- params = {
- 'path': ARCHIVE_PATH
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
-
- # DATA = read_file_bytestream(ARCHIVE_FILE)
- DATA = s3reader(ARCHIVE_FILE)
-
- req = request.Request(url, data=DATA, method="PUT")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- print(r.status)
- get_dagster_logger().info(f"Archive: {str(r.status)}")
-
- # c = r.read()
- # print(c)
- # d = json.loads(c)
- # print(d)
-
- ## ------------ Start
-
- url = URL + 'containers/' + cid + '/start'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Start: {str(r.status)}")
-
- ## ------------ Wait expect 200
-
- url = URL + 'containers/' + cid + '/wait'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Wait: {str(r.status)}")
-
- ## ------------ Copy logs expect 200
-
- url = URL + 'containers/' + cid + '/logs'
- params = {
- 'stdout': 'true',
- 'stderr': 'false'
- }
- query_string = urllib.parse.urlencode(params)
-
- url = url + "?" + query_string
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- c = r.read()
-
- # write to file
- # f = open(LOGFILE, 'w')
- # f.write(str(c))
- # f.close()
-
- # write to s3
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
-
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"Logs: {str(r.status)}")
-
- ## ------------ Remove expect 204
-
- url = URL + 'containers/' + cid
- req = request.Request(url, method="DELETE")
- req.add_header('X-API-Key', APIKEY)
- # req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Remove: {str(r.status)}")
-
- return 0
-
-@op
-def rsu_gleaner(context):
- returned_value = gleanerio(("gleaner"), "rsu")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner notes are {r} ")
- return r
-
-@op
-def rsu_nabu(context, msg: str):
- returned_value = gleanerio(("nabu"), "rsu")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def rsu_nabuprov(context, msg: str):
- returned_value = gleanerio(("prov"), "rsu")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def rsu_nabuorg(context, msg: str):
- returned_value = gleanerio(("orgs"), "rsu")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def rsu_naburelease(context, msg: str):
- returned_value = gleanerio(("release"), "rsu")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@graph
-def harvest_rsu():
- harvest = rsu_gleaner()
- load1 = rsu_nabu(harvest)
- load2 = rsu_nabuprov(load1)
- load3 = rsu_nabuorg(load2)
- load4 = rsu_naburelease(load3)
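
Another gap in the deleted flow: each step calls `request.urlopen` and logs `r.status`, but `urlopen` raises `HTTPError` on any non-2xx response, so a failed start or wait aborts the op before the final DELETE ever runs and the container is left behind. A hedged sketch of the guard that sequence would need (`run_container_step` is illustrative only):

```python
from urllib import request
from urllib.error import HTTPError, URLError

def run_container_step(req: request.Request, step: str) -> bytes:
    """Execute one lifecycle request, converting transport errors to one exception type."""
    try:
        with request.urlopen(req) as resp:
            return resp.read()
    except HTTPError as err:
        raise RuntimeError(f"{step} failed: HTTP {err.code}") from err
    except URLError as err:
        raise RuntimeError(f"{step} failed: {err.reason}") from err

# The caller would wrap start/wait/logs in try/finally and always issue the
# DELETE to URL + "containers/" + cid in the finally block, so containers
# are removed even when a middle step fails.
```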
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_sceincespo.py b/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_sceincespo.py
deleted file mode 100644
index 1c15a43e..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_sceincespo.py
+++ /dev/null
@@ -1,304 +0,0 @@
-import distutils
-
-from dagster import op, graph, get_dagster_logger
-import subprocess
-import os, json, io
-import urllib
-from urllib import request
-from dagster import job, op, get_dagster_logger
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-
-# Vars and Envs
-
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-MINIO_URL = os.environ.get('GLEANER_MINIO_URL')
-MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT')
-MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL')
-MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET')
-MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY')
-MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET')
-
-
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- get_dagster_logger().info(f"S3 URL : {str(os.environ.get('GLEANER_MINIO_URL'))}")
- get_dagster_logger().info(f"S3 PORT : {str(os.environ.get('GLEANER_MINIO_PORT'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {str(os.environ.get('GLEANER_MINIO_BUCKET'))}")
- get_dagster_logger().info(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
- try:
- data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- client = Minio(
- server,
- # secure=True,
- secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname
- client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'),
- objPrefix,
- io.BytesIO(data),
- len(data))
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-
-
-def gleanerio(mode, source):
- ## ------------ Create
-
- get_dagster_logger().info(f"Create: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH')
- CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"]
- NAME = "gleaner01_" + source
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "nabu"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
- return 1
-
- data = {}
- data["Image"] = IMAGE
- data["Cmd"] = CMD
-
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
- enva = []
- enva.append(str("MINIO_URL={}".format(MINIO_URL)))
- enva.append(str("MINIO_PORT={}".format(MINIO_PORT)))
- enva.append(str("MINIO_SSL={}".format(MINIO_SSL)))
- enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET)))
- enva.append(str("MINIO_KEY={}".format(MINIO_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET)))
-
- data["Env"] = enva
-
- url = URL + 'containers/create'
- params = {
- "name": NAME
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- get_dagster_logger().info(f"URL: {str(url)}")
-
- req = request.Request(url, str.encode(json.dumps(data)))
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- c = r.read()
- d = json.loads(c)
- cid = d['Id']
-
- print(r.status)
- get_dagster_logger().info(f"Create: {str(r.status)}")
-
- # print(cid)
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
- url = URL + 'containers/' + cid + '/archive'
- params = {
- 'path': ARCHIVE_PATH
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
-
- # DATA = read_file_bytestream(ARCHIVE_FILE)
- DATA = s3reader(ARCHIVE_FILE)
-
- req = request.Request(url, data=DATA, method="PUT")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- print(r.status)
- get_dagster_logger().info(f"Archive: {str(r.status)}")
-
- # c = r.read()
- # print(c)
- # d = json.loads(c)
- # print(d)
-
- ## ------------ Start
-
- url = URL + 'containers/' + cid + '/start'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Start: {str(r.status)}")
-
- ## ------------ Wait expect 200
-
- url = URL + 'containers/' + cid + '/wait'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Wait: {str(r.status)}")
-
- ## ------------ Copy logs expect 200
-
- url = URL + 'containers/' + cid + '/logs'
- params = {
- 'stdout': 'true',
- 'stderr': 'false'
- }
- query_string = urllib.parse.urlencode(params)
-
- url = url + "?" + query_string
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- c = r.read()
-
- # write to file
- # f = open(LOGFILE, 'w')
- # f.write(str(c))
- # f.close()
-
- # write to s3
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
-
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"Logs: {str(r.status)}")
-
- ## ------------ Remove expect 204
-
- url = URL + 'containers/' + cid
- req = request.Request(url, method="DELETE")
- req.add_header('X-API-Key', APIKEY)
- # req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Remove: {str(r.status)}")
-
- return 0
-
-@op
-def sceincespo_gleaner(context):
- returned_value = gleanerio(("gleaner"), "sceincespo")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner notes are {r} ")
- return r
-
-@op
-def sceincespo_nabu(context, msg: str):
- returned_value = gleanerio(("nabu"), "sceincespo")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def sceincespo_nabuprov(context, msg: str):
- returned_value = gleanerio(("prov"), "sceincespo")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def sceincespo_nabuorg(context, msg: str):
- returned_value = gleanerio(("orgs"), "sceincespo")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def sceincespo_naburelease(context, msg: str):
- returned_value = gleanerio(("release"), "sceincespo")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@graph
-def harvest_sceincespo():
- harvest = sceincespo_gleaner()
- load1 = sceincespo_nabu(harvest)
- load2 = sceincespo_nabuprov(load1)
- load3 = sceincespo_nabuorg(load2)
- load4 = sceincespo_naburelease(load3)
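
The wait step in these modules records only the HTTP status of the POST, so a gleaner or nabu run that exits nonzero is still reported as success and `gleanerio` returns 0 unconditionally. Per the Docker Engine API that Portainer proxies, the wait endpoint's JSON body carries the container's exit code; a sketch of surfacing it (field name per the Docker API docs, treated here as an assumption):

```python
import json
from urllib import request

def wait_exit_code(base_url: str, api_key: str, cid: str) -> int:
    """POST /containers/{id}/wait and return the container's exit code."""
    req = request.Request(f"{base_url}containers/{cid}/wait", method="POST")
    req.add_header("X-API-Key", api_key)
    req.add_header("accept", "application/json")
    with request.urlopen(req) as resp:
        body = json.loads(resp.read())
    # Docker reports {"StatusCode": <exit code>, ...}; -1 if absent
    return int(body.get("StatusCode", -1))

# A caller could then fail the Dagster op when the code is nonzero instead
# of returning 0 regardless of what happened inside the container.
```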
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_tdi.py b/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_tdi.py
deleted file mode 100644
index d6bd59ed..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_tdi.py
+++ /dev/null
@@ -1,304 +0,0 @@
-import distutils
-
-from dagster import op, graph, get_dagster_logger
-import subprocess
-import os, json, io
-import urllib
-from urllib import request
-from dagster import job, op, get_dagster_logger
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-
-# Vars and Envs
-
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-MINIO_URL = os.environ.get('GLEANER_MINIO_URL')
-MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT')
-MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL')
-MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET')
-MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY')
-MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET')
-
-
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- get_dagster_logger().info(f"S3 URL : {str(os.environ.get('GLEANER_MINIO_URL'))}")
- get_dagster_logger().info(f"S3 PORT : {str(os.environ.get('GLEANER_MINIO_PORT'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {str(os.environ.get('GLEANER_MINIO_BUCKET'))}")
- get_dagster_logger().info(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
- try:
- data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- client = Minio(
- server,
- # secure=True,
- secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname
- client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'),
- objPrefix,
- io.BytesIO(data),
- len(data))
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-
-
-def gleanerio(mode, source):
- ## ------------ Create
-
- get_dagster_logger().info(f"Create: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH')
- CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"]
- NAME = "gleaner01_" + source
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "nabu"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
- return 1
-
- data = {}
- data["Image"] = IMAGE
- data["Cmd"] = CMD
-
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
- enva = []
- enva.append(str("MINIO_URL={}".format(MINIO_URL)))
- enva.append(str("MINIO_PORT={}".format(MINIO_PORT)))
- enva.append(str("MINIO_SSL={}".format(MINIO_SSL)))
- enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET)))
- enva.append(str("MINIO_KEY={}".format(MINIO_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET)))
-
- data["Env"] = enva
-
- url = URL + 'containers/create'
- params = {
- "name": NAME
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- get_dagster_logger().info(f"URL: {str(url)}")
-
- req = request.Request(url, str.encode(json.dumps(data)))
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- c = r.read()
- d = json.loads(c)
- cid = d['Id']
-
- print(r.status)
- get_dagster_logger().info(f"Create: {str(r.status)}")
-
- # print(cid)
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
- url = URL + 'containers/' + cid + '/archive'
- params = {
- 'path': ARCHIVE_PATH
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
-
- # DATA = read_file_bytestream(ARCHIVE_FILE)
- DATA = s3reader(ARCHIVE_FILE)
-
- req = request.Request(url, data=DATA, method="PUT")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- print(r.status)
- get_dagster_logger().info(f"Archive: {str(r.status)}")
-
- # c = r.read()
- # print(c)
- # d = json.loads(c)
- # print(d)
-
- ## ------------ Start
-
- url = URL + 'containers/' + cid + '/start'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Start: {str(r.status)}")
-
- ## ------------ Wait expect 200
-
- url = URL + 'containers/' + cid + '/wait'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Wait: {str(r.status)}")
-
- ## ------------ Copy logs expect 200
-
- url = URL + 'containers/' + cid + '/logs'
- params = {
- 'stdout': 'true',
- 'stderr': 'false'
- }
- query_string = urllib.parse.urlencode(params)
-
- url = url + "?" + query_string
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- c = r.read()
-
- # write to file
- # f = open(LOGFILE, 'w')
- # f.write(str(c))
- # f.close()
-
- # write to s3
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
-
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"Logs: {str(r.status)}")
-
- ## ------------ Remove expect 204
-
- url = URL + 'containers/' + cid
- req = request.Request(url, method="DELETE")
- req.add_header('X-API-Key', APIKEY)
- # req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Remove: {str(r.status)}")
-
- return 0
-
-@op
-def tdi_gleaner(context):
- returned_value = gleanerio(("gleaner"), "tdi")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner notes are {r} ")
- return r
-
-@op
-def tdi_nabu(context, msg: str):
- returned_value = gleanerio(("nabu"), "tdi")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def tdi_nabuprov(context, msg: str):
- returned_value = gleanerio(("prov"), "tdi")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def tdi_nabuorg(context, msg: str):
- returned_value = gleanerio(("orgs"), "tdi")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def tdi_naburelease(context, msg: str):
- returned_value = gleanerio(("release"), "tdi")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@graph
-def harvest_tdi():
- harvest = tdi_gleaner()
- load1 = tdi_nabu(harvest)
- load2 = tdi_nabuprov(load1)
- load3 = tdi_nabuorg(load2)
- load4 = tdi_naburelease(load3)
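
Two quirks in the log-copy step of these files. First, `c` is already bytes from `r.read()`, so `str(c).encode()` uploads the Python repr (`b'...'` with escaped newlines) rather than the log text; passing `c` straight to `s3loader` is what was intended. Second, when the container runs without a TTY, the Docker logs endpoint multiplexes stdout and stderr behind 8-byte frame headers, which would need stripping for a readable log file. A sketch addressing both (the multiplexed frame layout is taken from the documented Docker attach/logs stream format, assumed to apply here):

```python
import struct

def demux_docker_logs(raw: bytes) -> bytes:
    """Strip Docker's 8-byte stream-frame headers from a multiplexed log payload."""
    out, i = bytearray(), 0
    while i + 8 <= len(raw):
        # header: 1 byte stream id, 3 bytes padding, 4-byte big-endian length
        _stream, length = struct.unpack(">BxxxL", raw[i:i + 8])
        out += raw[i + 8:i + 8 + length]
        i += 8 + length
    return bytes(out)

# s3loader(demux_docker_logs(c), NAME) would then store the log text itself,
# with no str()/encode() round trip.
```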
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_tdl.py b/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_tdl.py
deleted file mode 100644
index 344d5c8e..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_tdl.py
+++ /dev/null
@@ -1,304 +0,0 @@
-import distutils
-
-from dagster import op, graph, get_dagster_logger
-import subprocess
-import os, json, io
-import urllib
-from urllib import request
-from dagster import job, op, get_dagster_logger
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-
-# Vars and Envs
-
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-MINIO_URL = os.environ.get('GLEANER_MINIO_URL')
-MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT')
-MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL')
-MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET')
-MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY')
-MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET')
-
-
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- get_dagster_logger().info(f"S3 URL : {str(os.environ.get('GLEANER_MINIO_URL'))}")
- get_dagster_logger().info(f"S3 PORT : {str(os.environ.get('GLEANER_MINIO_PORT'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {str(os.environ.get('GLEANER_MINIO_BUCKET'))}")
- get_dagster_logger().info(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
- try:
- data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- client = Minio(
- server,
- # secure=True,
- secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname
- client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'),
- objPrefix,
- io.BytesIO(data),
- len(data))
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-
-
-def gleanerio(mode, source):
- ## ------------ Create
-
- get_dagster_logger().info(f"Create: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH')
- CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"]
- NAME = "gleaner01_" + source
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "nabu"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
- return 1
-
- data = {}
- data["Image"] = IMAGE
- data["Cmd"] = CMD
-
- # add env variables here, e.g. "Env": ["FOO=bar", "BAZ=quux"]
- enva = []
- enva.append(str("MINIO_URL={}".format(MINIO_URL)))
- enva.append(str("MINIO_PORT={}".format(MINIO_PORT)))
- enva.append(str("MINIO_SSL={}".format(MINIO_SSL)))
- enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET)))
- enva.append(str("MINIO_KEY={}".format(MINIO_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET)))
-
- data["Env"] = enva
-
- url = URL + 'containers/create'
- params = {
- "name": NAME
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- get_dagster_logger().info(f"URL: {str(url)}")
-
- req = request.Request(url, str.encode(json.dumps(data)))
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- c = r.read()
- d = json.loads(c)
- cid = d['Id']
-
- print(r.status)
- get_dagster_logger().info(f"Create: {str(r.status)}")
-
- # print(cid)
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
- url = URL + 'containers/' + cid + '/archive'
- params = {
- 'path': ARCHIVE_PATH
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
-
- # DATA = read_file_bytestream(ARCHIVE_FILE)
- DATA = s3reader(ARCHIVE_FILE)
-
- req = request.Request(url, data=DATA, method="PUT")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- print(r.status)
- get_dagster_logger().info(f"Archive: {str(r.status)}")
-
- # c = r.read()
- # print(c)
- # d = json.loads(c)
- # print(d)
-
- ## ------------ Start
-
- url = URL + 'containers/' + cid + '/start'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Start: {str(r.status)}")
-
- ## ------------ Wait expect 200
-
- url = URL + 'containers/' + cid + '/wait'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Wait: {str(r.status)}")
-
- ## ------------ Copy logs expect 200
-
- url = URL + 'containers/' + cid + '/logs'
- params = {
- 'stdout': 'true',
- 'stderr': 'false'
- }
- query_string = urllib.parse.urlencode(params)
-
- url = url + "?" + query_string
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- c = r.read()
-
- # write to file
- # f = open(LOGFILE, 'w')
- # f.write(str(c))
- # f.close()
-
- # write to s3
- s3loader(c, NAME)  # c from r.read() is already a bytes-like object
-
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"Logs: {str(r.status)}")
-
- ## ------------ Remove expect 204
-
- url = URL + 'containers/' + cid
- req = request.Request(url, method="DELETE")
- req.add_header('X-API-Key', APIKEY)
- # req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Remove: {str(r.status)}")
-
- return 0
-
-@op
-def tdl_gleaner(context):
- returned_value = gleanerio(("gleaner"), "tdl")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner notes are {r} ")
- return r
-
-@op
-def tdl_nabu(context, msg: str):
- returned_value = gleanerio(("nabu"), "tdl")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def tdl_nabuprov(context, msg: str):
- returned_value = gleanerio(("prov"), "tdl")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def tdl_nabuorg(context, msg: str):
- returned_value = gleanerio(("orgs"), "tdl")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def tdl_naburelease(context, msg: str):
- returned_value = gleanerio(("release"), "tdl")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@graph
-def harvest_tdl():
- harvest = tdl_gleaner()
- load1 = tdl_nabu(harvest)
- load2 = tdl_nabuprov(load1)
- load3 = tdl_nabuorg(load2)
- load4 = tdl_naburelease(load3)
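
NOTE: each deleted implnet_ops_*.py module below is identical except for the hard-coded source name, which is why they can be dropped wholesale once the workflow code is generated or parameterized instead. A minimal factory sketch of that idea follows; `make_harvest_graph` is an illustrative name and the sketch assumes the `gleanerio(mode, source)` helper these modules define — it is not the repo's actual replacement.

```python
from dagster import get_dagster_logger, graph, op

def make_harvest_graph(source: str):
    """Build the gleaner -> nabu -> prov -> orgs -> release chain for one source."""

    @op(name=f"{source}_gleaner")
    def gleaner_op() -> str:
        rv = gleanerio("gleaner", source)  # shared helper defined in these modules
        get_dagster_logger().info(f"Gleaner returned {rv} for {source}")
        return f"returned value:{rv}"

    def nabu_op_for(mode: str):
        @op(name=f"{source}_nabu_{mode}")
        def nabu_op(msg: str) -> str:
            return msg + f"returned value:{gleanerio(mode, source)}"
        return nabu_op

    @graph(name=f"harvest_{source}")
    def harvest():
        msg = gleaner_op()
        for mode in ("nabu", "prov", "orgs", "release"):
            msg = nabu_op_for(mode)(msg)

    return harvest
```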
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_ucdl.py b/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_ucdl.py
deleted file mode 100644
index 69b8ffea..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_ucdl.py
+++ /dev/null
@@ -1,304 +0,0 @@
-import distutils.util
-
-from dagster import op, graph, get_dagster_logger
-import subprocess
-import os, json, io
-import urllib
-from urllib import request
-from dagster import job, op, get_dagster_logger
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-
-# Vars and Envs
-
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-MINIO_URL = os.environ.get('GLEANER_MINIO_URL')
-MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT')
-MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL')
-MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET')
-MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY')
-MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET')
-
-
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- get_dagster_logger().info(f"S3 URL : {str(os.environ.get('GLEANER_MINIO_URL'))}")
- get_dagster_logger().info(f"S3 PORT : {str(os.environ.get('GLEANER_MINIO_PORT'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {str(os.environ.get('GLEANER_MINIO_BUCKET'))}")
- get_dagster_logger().info(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
- try:
- data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- client = Minio(
- server,
- # secure=True,
- secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname
- client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'),
- objPrefix,
- io.BytesIO(data),
- len(data))
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-
-
-def gleanerio(mode, source):
- ## ------------ Create
-
- get_dagster_logger().info(f"Create: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH')
- CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"]
- NAME = "gleaner01_" + source
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "nabu"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
- return 1
-
- data = {}
- data["Image"] = IMAGE
- data["Cmd"] = CMD
-
- # add env variables here, e.g. "Env": ["FOO=bar", "BAZ=quux"]
- enva = []
- enva.append(str("MINIO_URL={}".format(MINIO_URL)))
- enva.append(str("MINIO_PORT={}".format(MINIO_PORT)))
- enva.append(str("MINIO_SSL={}".format(MINIO_SSL)))
- enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET)))
- enva.append(str("MINIO_KEY={}".format(MINIO_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET)))
-
- data["Env"] = enva
-
- url = URL + 'containers/create'
- params = {
- "name": NAME
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- get_dagster_logger().info(f"URL: {str(url)}")
-
- req = request.Request(url, str.encode(json.dumps(data)))
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- c = r.read()
- d = json.loads(c)
- cid = d['Id']
-
- print(r.status)
- get_dagster_logger().info(f"Create: {str(r.status)}")
-
- # print(cid)
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
- url = URL + 'containers/' + cid + '/archive'
- params = {
- 'path': ARCHIVE_PATH
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
-
- # DATA = read_file_bytestream(ARCHIVE_FILE)
- DATA = s3reader(ARCHIVE_FILE)
-
- req = request.Request(url, data=DATA, method="PUT")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- print(r.status)
- get_dagster_logger().info(f"Archive: {str(r.status)}")
-
- # c = r.read()
- # print(c)
- # d = json.loads(c)
- # print(d)
-
- ## ------------ Start
-
- url = URL + 'containers/' + cid + '/start'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Start: {str(r.status)}")
-
- ## ------------ Wait expect 200
-
- url = URL + 'containers/' + cid + '/wait'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Wait: {str(r.status)}")
-
- ## ------------ Copy logs expect 200
-
- url = URL + 'containers/' + cid + '/logs'
- params = {
- 'stdout': 'true',
- 'stderr': 'false'
- }
- query_string = urllib.parse.urlencode(params)
-
- url = url + "?" + query_string
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- c = r.read()
-
- # write to file
- # f = open(LOGFILE, 'w')
- # f.write(str(c))
- # f.close()
-
- # write to s3
- s3loader(c, NAME)  # c from r.read() is already a bytes-like object
-
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"Logs: {str(r.status)}")
-
- ## ------------ Remove expect 204
-
- url = URL + 'containers/' + cid
- req = request.Request(url, method="DELETE")
- req.add_header('X-API-Key', APIKEY)
- # req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Remove: {str(r.status)}")
-
- return 0
-
-@op
-def ucdl_gleaner(context):
- returned_value = gleanerio(("gleaner"), "ucdl")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner notes are {r} ")
- return r
-
-@op
-def ucdl_nabu(context, msg: str):
- returned_value = gleanerio(("nabu"), "ucdl")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def ucdl_nabuprov(context, msg: str):
- returned_value = gleanerio(("prov"), "ucdl")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def ucdl_nabuorg(context, msg: str):
- returned_value = gleanerio(("orgs"), "ucdl")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def ucdl_naburelease(context, msg: str):
- returned_value = gleanerio(("release"), "ucdl")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@graph
-def harvest_ucdl():
- harvest = ucdl_gleaner()
- load1 = ucdl_nabu(harvest)
- load2 = ucdl_nabuprov(load1)
- load3 = ucdl_nabuorg(load2)
- load4 = ucdl_naburelease(load3)
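
NOTE: these modules parse GLEANER_MINIO_SSL with `distutils.util.strtobool`, which requires `import distutils.util` (a bare `import distutils` does not reliably load the submodule) and disappears in Python 3.12, where distutils is removed from the standard library. A stdlib-only sketch of the same check; `env_bool` is an illustrative name:

```python
import os

def env_bool(name: str, default: bool = False) -> bool:
    """Interpret a truthy/falsy environment variable."""
    val = os.environ.get(name)
    if val is None:
        return default
    return val.strip().lower() in ("1", "true", "t", "yes", "y", "on")

# e.g. secure=env_bool("GLEANER_MINIO_SSL") in the Minio(...) constructor
```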
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_ucla.py b/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_ucla.py
deleted file mode 100644
index c9311d11..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_ucla.py
+++ /dev/null
@@ -1,304 +0,0 @@
-import distutils.util
-
-from dagster import op, graph, get_dagster_logger
-import subprocess
-import os, json, io
-import urllib
-from urllib import request
-from dagster import job, op, get_dagster_logger
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-
-# Vars and Envs
-
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-MINIO_URL = os.environ.get('GLEANER_MINIO_URL')
-MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT')
-MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL')
-MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET')
-MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY')
-MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET')
-
-
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- get_dagster_logger().info(f"S3 URL : {str(os.environ.get('GLEANER_MINIO_URL'))}")
- get_dagster_logger().info(f"S3 PORT : {str(os.environ.get('GLEANER_MINIO_PORT'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {str(os.environ.get('GLEANER_MINIO_BUCKET'))}")
- get_dagster_logger().info(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
- try:
- data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- client = Minio(
- server,
- # secure=True,
- secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname
- client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'),
- objPrefix,
- io.BytesIO(data),
- len(data))
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-
-
-def gleanerio(mode, source):
- ## ------------ Create
-
- get_dagster_logger().info(f"Create: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH')
- CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"]
- NAME = "gleaner01_" + source
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "nabu"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
- return 1
-
- data = {}
- data["Image"] = IMAGE
- data["Cmd"] = CMD
-
- # add env variables here, e.g. "Env": ["FOO=bar", "BAZ=quux"]
- enva = []
- enva.append(str("MINIO_URL={}".format(MINIO_URL)))
- enva.append(str("MINIO_PORT={}".format(MINIO_PORT)))
- enva.append(str("MINIO_SSL={}".format(MINIO_SSL)))
- enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET)))
- enva.append(str("MINIO_KEY={}".format(MINIO_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET)))
-
- data["Env"] = enva
-
- url = URL + 'containers/create'
- params = {
- "name": NAME
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- get_dagster_logger().info(f"URL: {str(url)}")
-
- req = request.Request(url, str.encode(json.dumps(data)))
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- c = r.read()
- d = json.loads(c)
- cid = d['Id']
-
- print(r.status)
- get_dagster_logger().info(f"Create: {str(r.status)}")
-
- # print(cid)
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
- url = URL + 'containers/' + cid + '/archive'
- params = {
- 'path': ARCHIVE_PATH
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
-
- # DATA = read_file_bytestream(ARCHIVE_FILE)
- DATA = s3reader(ARCHIVE_FILE)
-
- req = request.Request(url, data=DATA, method="PUT")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- print(r.status)
- get_dagster_logger().info(f"Archive: {str(r.status)}")
-
- # c = r.read()
- # print(c)
- # d = json.loads(c)
- # print(d)
-
- ## ------------ Start
-
- url = URL + 'containers/' + cid + '/start'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Start: {str(r.status)}")
-
- ## ------------ Wait expect 200
-
- url = URL + 'containers/' + cid + '/wait'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Wait: {str(r.status)}")
-
- ## ------------ Copy logs expect 200
-
- url = URL + 'containers/' + cid + '/logs'
- params = {
- 'stdout': 'true',
- 'stderr': 'false'
- }
- query_string = urllib.parse.urlencode(params)
-
- url = url + "?" + query_string
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- c = r.read()
-
- # write to file
- # f = open(LOGFILE, 'w')
- # f.write(str(c))
- # f.close()
-
- # write to s3
- s3loader(c, NAME)  # c from r.read() is already a bytes-like object
-
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"Logs: {str(r.status)}")
-
- ## ------------ Remove expect 204
-
- url = URL + 'containers/' + cid
- req = request.Request(url, method="DELETE")
- req.add_header('X-API-Key', APIKEY)
- # req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Remove: {str(r.status)}")
-
- return 0
-
-@op
-def ucla_gleaner(context):
- returned_value = gleanerio(("gleaner"), "ucla")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner notes are {r} ")
- return r
-
-@op
-def ucla_nabu(context, msg: str):
- returned_value = gleanerio(("nabu"), "ucla")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def ucla_nabuprov(context, msg: str):
- returned_value = gleanerio(("prov"), "ucla")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def ucla_nabuorg(context, msg: str):
- returned_value = gleanerio(("orgs"), "ucla")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def ucla_naburelease(context, msg: str):
- returned_value = gleanerio(("release"), "ucla")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@graph
-def harvest_ucla():
- harvest = ucla_gleaner()
- load1 = ucla_nabu(harvest)
- load2 = ucla_nabuprov(load1)
- load3 = ucla_nabuorg(load2)
- load4 = ucla_naburelease(load3)
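
NOTE: every `gleanerio` call issues the same six urllib requests (create, archive, start, wait, logs, remove) with hand-built headers, and because `urlopen` raises `HTTPError` on non-2xx responses, the "expect 200/204" status prints never actually observe a failure. A consolidated helper sketch, assuming we want the status code returned instead of an exception; the `portainer` function is illustrative, not existing code:

```python
import json
from urllib import error, parse, request

def portainer(base_url: str, api_key: str, path: str, method: str = "GET",
              body: bytes | None = None, params: dict | None = None,
              content_type: str = "application/json"):
    """One wrapper for the create/archive/start/wait/logs/remove calls."""
    url = base_url + path
    if params:
        url += "?" + parse.urlencode(params)
    req = request.Request(url, data=body, method=method)
    req.add_header("X-API-Key", api_key)
    req.add_header("accept", "application/json")
    if body is not None:
        req.add_header("content-type", content_type)
    try:
        with request.urlopen(req) as r:
            return r.status, r.read()
    except error.HTTPError as e:
        return e.code, e.read()

# e.g. status, raw = portainer(URL, APIKEY, "containers/create", "POST",
#                              json.dumps(data).encode(), {"name": NAME})
# cid = json.loads(raw)["Id"]
```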
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_unb.py b/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_unb.py
deleted file mode 100644
index be9c2bfb..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_unb.py
+++ /dev/null
@@ -1,304 +0,0 @@
-import distutils.util
-
-from dagster import op, graph, get_dagster_logger
-import subprocess
-import os, json, io
-import urllib
-from urllib import request
-from dagster import job, op, get_dagster_logger
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-
-# Vars and Envs
-
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-MINIO_URL = os.environ.get('GLEANER_MINIO_URL')
-MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT')
-MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL')
-MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET')
-MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY')
-MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET')
-
-
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- get_dagster_logger().info(f"S3 URL : {str(os.environ.get('GLEANER_MINIO_URL'))}")
- get_dagster_logger().info(f"S3 PORT : {str(os.environ.get('GLEANER_MINIO_PORT'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {str(os.environ.get('GLEANER_MINIO_BUCKET'))}")
- get_dagster_logger().info(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
- try:
- data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- client = Minio(
- server,
- # secure=True,
- secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname
- client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'),
- objPrefix,
- io.BytesIO(data),
- len(data))
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-
-
-def gleanerio(mode, source):
- ## ------------ Create
-
- get_dagster_logger().info(f"Create: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH')
- CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"]
- NAME = "gleaner01_" + source
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "nabu"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
- return 1
-
- data = {}
- data["Image"] = IMAGE
- data["Cmd"] = CMD
-
- # add env variables here, e.g. "Env": ["FOO=bar", "BAZ=quux"]
- enva = []
- enva.append(str("MINIO_URL={}".format(MINIO_URL)))
- enva.append(str("MINIO_PORT={}".format(MINIO_PORT)))
- enva.append(str("MINIO_SSL={}".format(MINIO_SSL)))
- enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET)))
- enva.append(str("MINIO_KEY={}".format(MINIO_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET)))
-
- data["Env"] = enva
-
- url = URL + 'containers/create'
- params = {
- "name": NAME
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- get_dagster_logger().info(f"URL: {str(url)}")
-
- req = request.Request(url, str.encode(json.dumps(data)))
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- c = r.read()
- d = json.loads(c)
- cid = d['Id']
-
- print(r.status)
- get_dagster_logger().info(f"Create: {str(r.status)}")
-
- # print(cid)
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
- url = URL + 'containers/' + cid + '/archive'
- params = {
- 'path': ARCHIVE_PATH
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
-
- # DATA = read_file_bytestream(ARCHIVE_FILE)
- DATA = s3reader(ARCHIVE_FILE)
-
- req = request.Request(url, data=DATA, method="PUT")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- print(r.status)
- get_dagster_logger().info(f"Archive: {str(r.status)}")
-
- # c = r.read()
- # print(c)
- # d = json.loads(c)
- # print(d)
-
- ## ------------ Start
-
- url = URL + 'containers/' + cid + '/start'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Start: {str(r.status)}")
-
- ## ------------ Wait expect 200
-
- url = URL + 'containers/' + cid + '/wait'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Wait: {str(r.status)}")
-
- ## ------------ Copy logs expect 200
-
- url = URL + 'containers/' + cid + '/logs'
- params = {
- 'stdout': 'true',
- 'stderr': 'false'
- }
- query_string = urllib.parse.urlencode(params)
-
- url = url + "?" + query_string
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- c = r.read()
-
- # write to file
- # f = open(LOGFILE, 'w')
- # f.write(str(c))
- # f.close()
-
- # write to s3
- s3loader(c, NAME)  # c from r.read() is already a bytes-like object
-
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"Logs: {str(r.status)}")
-
- ## ------------ Remove expect 204
-
- url = URL + 'containers/' + cid
- req = request.Request(url, method="DELETE")
- req.add_header('X-API-Key', APIKEY)
- # req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Remove: {str(r.status)}")
-
- return 0
-
-@op
-def unb_gleaner(context):
- returned_value = gleanerio(("gleaner"), "unb")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner notes are {r} ")
- return r
-
-@op
-def unb_nabu(context, msg: str):
- returned_value = gleanerio(("nabu"), "unb")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def unb_nabuprov(context, msg: str):
- returned_value = gleanerio(("prov"), "unb")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def unb_nabuorg(context, msg: str):
- returned_value = gleanerio(("orgs"), "unb")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def unb_naburelease(context, msg: str):
- returned_value = gleanerio(("release"), "unb")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@graph
-def harvest_unb():
- harvest = unb_gleaner()
- load1 = unb_nabu(harvest)
- load2 = unb_nabuprov(load1)
- load3 = unb_nabuorg(load2)
- load4 = unb_naburelease(load3)
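
NOTE: the container names are fixed per source ("gleaner01_" + source, "nabu01_" + source), so a run that dies before the final remove step leaves a container behind and the next create call fails with a name conflict. One mitigation, an assumption rather than current behaviour, is a timestamped name:

```python
from datetime import datetime

def container_name(prefix: str, source: str) -> str:
    """Unique per-run container name so leftovers never block the next run."""
    return f"{prefix}_{source}_{datetime.now().strftime('%Y%m%d%H%M%S')}"

# NAME = container_name("nabu01", source)
```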
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_unc.py b/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_unc.py
deleted file mode 100644
index e28ebfff..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_unc.py
+++ /dev/null
@@ -1,304 +0,0 @@
-import distutils.util
-
-from dagster import op, graph, get_dagster_logger
-import subprocess
-import os, json, io
-import urllib
-from urllib import request
-from dagster import job, op, get_dagster_logger
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-
-# Vars and Envs
-
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-MINIO_URL = os.environ.get('GLEANER_MINIO_URL')
-MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT')
-MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL')
-MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET')
-MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY')
-MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET')
-
-
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- get_dagster_logger().info(f"S3 URL : {str(os.environ.get('GLEANER_MINIO_URL'))}")
- get_dagster_logger().info(f"S3 PORT : {str(os.environ.get('GLEANER_MINIO_PORT'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {str(os.environ.get('GLEANER_MINIO_BUCKET'))}")
- get_dagster_logger().info(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
- try:
- data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- client = Minio(
- server,
- # secure=True,
- secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname
- client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'),
- objPrefix,
- io.BytesIO(data),
- len(data))
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-
-
-def gleanerio(mode, source):
- ## ------------ Create
-
- get_dagster_logger().info(f"Create: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH')
- CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"]
- NAME = "gleaner01_" + source
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "nabu"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
- return 1
-
- data = {}
- data["Image"] = IMAGE
- data["Cmd"] = CMD
-
- # add env variables here, e.g. "Env": ["FOO=bar", "BAZ=quux"]
- enva = []
- enva.append(str("MINIO_URL={}".format(MINIO_URL)))
- enva.append(str("MINIO_PORT={}".format(MINIO_PORT)))
- enva.append(str("MINIO_SSL={}".format(MINIO_SSL)))
- enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET)))
- enva.append(str("MINIO_KEY={}".format(MINIO_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET)))
-
- data["Env"] = enva
-
- url = URL + 'containers/create'
- params = {
- "name": NAME
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- get_dagster_logger().info(f"URL: {str(url)}")
-
- req = request.Request(url, str.encode(json.dumps(data)))
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- c = r.read()
- d = json.loads(c)
- cid = d['Id']
-
- print(r.status)
- get_dagster_logger().info(f"Create: {str(r.status)}")
-
- # print(cid)
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
- url = URL + 'containers/' + cid + '/archive'
- params = {
- 'path': ARCHIVE_PATH
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
-
- # DATA = read_file_bytestream(ARCHIVE_FILE)
- DATA = s3reader(ARCHIVE_FILE)
-
- req = request.Request(url, data=DATA, method="PUT")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- print(r.status)
- get_dagster_logger().info(f"Archive: {str(r.status)}")
-
- # c = r.read()
- # print(c)
- # d = json.loads(c)
- # print(d)
-
- ## ------------ Start
-
- url = URL + 'containers/' + cid + '/start'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Start: {str(r.status)}")
-
- ## ------------ Wait expect 200
-
- url = URL + 'containers/' + cid + '/wait'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Wait: {str(r.status)}")
-
- ## ------------ Copy logs expect 200
-
- url = URL + 'containers/' + cid + '/logs'
- params = {
- 'stdout': 'true',
- 'stderr': 'false'
- }
- query_string = urllib.parse.urlencode(params)
-
- url = url + "?" + query_string
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- c = r.read()
-
- # write to file
- # f = open(LOGFILE, 'w')
- # f.write(str(c))
- # f.close()
-
- # write to s3
- s3loader(c, NAME)  # c from r.read() is already a bytes-like object
-
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"Logs: {str(r.status)}")
-
- ## ------------ Remove expect 204
-
- url = URL + 'containers/' + cid
- req = request.Request(url, method="DELETE")
- req.add_header('X-API-Key', APIKEY)
- # req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Remove: {str(r.status)}")
-
- return 0
-
-@op
-def unc_gleaner(context):
- returned_value = gleanerio(("gleaner"), "unc")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner notes are {r} ")
- return r
-
-@op
-def unc_nabu(context, msg: str):
- returned_value = gleanerio(("nabu"), "unc")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def unc_nabuprov(context, msg: str):
- returned_value = gleanerio(("prov"), "unc")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def unc_nabuorg(context, msg: str):
- returned_value = gleanerio(("orgs"), "unc")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def unc_naburelease(context, msg: str):
- returned_value = gleanerio(("release"), "unc")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@graph
-def harvest_unc():
- harvest = unc_gleaner()
- load1 = unc_nabu(harvest)
- load2 = unc_nabuprov(load1)
- load3 = unc_nabuorg(load2)
- load4 = unc_naburelease(load3)
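
NOTE: `gleanerio` signals failure by returning 1, but the ops only interpolate that return code into a log string, so a failed harvest still registers as a successful Dagster run. A sketch of a stricter op that raises instead; `unc_gleaner_strict` is an illustrative name and it reuses this module's `gleanerio` helper:

```python
from dagster import Failure, get_dagster_logger, op

@op
def unc_gleaner_strict() -> str:
    rv = gleanerio("gleaner", "unc")  # shared helper from this module
    if rv != 0:
        raise Failure(description=f"gleaner run failed for unc (rv={rv})")
    get_dagster_logger().info("gleaner completed for unc")
    return "ok"
```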
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_uva.py b/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_uva.py
deleted file mode 100644
index de81e633..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_uva.py
+++ /dev/null
@@ -1,304 +0,0 @@
-import distutils.util
-
-from dagster import op, graph, get_dagster_logger
-import subprocess
-import os, json, io
-import urllib
-from urllib import request
-from dagster import job, op, get_dagster_logger
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-
-# Vars and Envs
-
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-MINIO_URL = os.environ.get('GLEANER_MINIO_URL')
-MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT')
-MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL')
-MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET')
-MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY')
-MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET')
-
-
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- get_dagster_logger().info(f"S3 URL : {str(os.environ.get('GLEANER_MINIO_URL'))}")
- get_dagster_logger().info(f"S3 PORT : {str(os.environ.get('GLEANER_MINIO_PORT'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {str(os.environ.get('GLEANER_MINIO_BUCKET'))}")
- get_dagster_logger().info(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
- try:
- data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- client = Minio(
- server,
- # secure=True,
- secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname
- client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'),
- objPrefix,
- io.BytesIO(data),
- len(data))
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-
-
-def gleanerio(mode, source):
- ## ------------ Create
-
- get_dagster_logger().info(f"Create: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH')
- CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"]
- NAME = "gleaner01_" + source
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "nabu"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
- return 1
-
- data = {}
- data["Image"] = IMAGE
- data["Cmd"] = CMD
-
- # add env variables here, e.g. "Env": ["FOO=bar", "BAZ=quux"]
- enva = []
- enva.append(str("MINIO_URL={}".format(MINIO_URL)))
- enva.append(str("MINIO_PORT={}".format(MINIO_PORT)))
- enva.append(str("MINIO_SSL={}".format(MINIO_SSL)))
- enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET)))
- enva.append(str("MINIO_KEY={}".format(MINIO_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET)))
-
- data["Env"] = enva
-
- url = URL + 'containers/create'
- params = {
- "name": NAME
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- get_dagster_logger().info(f"URL: {str(url)}")
-
- req = request.Request(url, str.encode(json.dumps(data)))
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- c = r.read()
- d = json.loads(c)
- cid = d['Id']
-
- print(r.status)
- get_dagster_logger().info(f"Create: {str(r.status)}")
-
- # print(cid)
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
- url = URL + 'containers/' + cid + '/archive'
- params = {
- 'path': ARCHIVE_PATH
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
-
- # DATA = read_file_bytestream(ARCHIVE_FILE)
- DATA = s3reader(ARCHIVE_FILE)
-
- req = request.Request(url, data=DATA, method="PUT")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- print(r.status)
- get_dagster_logger().info(f"Archive: {str(r.status)}")
-
- # c = r.read()
- # print(c)
- # d = json.loads(c)
- # print(d)
-
- ## ------------ Start
-
- url = URL + 'containers/' + cid + '/start'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Start: {str(r.status)}")
-
- ## ------------ Wait expect 200
-
- url = URL + 'containers/' + cid + '/wait'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Wait: {str(r.status)}")
-
- ## ------------ Copy logs expect 200
-
- url = URL + 'containers/' + cid + '/logs'
- params = {
- 'stdout': 'true',
- 'stderr': 'false'
- }
- query_string = urllib.parse.urlencode(params)
-
- url = url + "?" + query_string
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- c = r.read()
-
- # write to file
- # f = open(LOGFILE, 'w')
- # f.write(str(c))
- # f.close()
-
- # write to s3
- s3loader(c, NAME)  # c from r.read() is already a bytes-like object
-
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"Logs: {str(r.status)}")
-
- ## ------------ Remove expect 204
-
- url = URL + 'containers/' + cid
- req = request.Request(url, method="DELETE")
- req.add_header('X-API-Key', APIKEY)
- # req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Remove: {str(r.status)}")
-
- return 0
-
-@op
-def uva_gleaner(context):
- returned_value = gleanerio(("gleaner"), "uva")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner notes are {r} ")
- return r
-
-@op
-def uva_nabu(context, msg: str):
- returned_value = gleanerio(("nabu"), "uva")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def uva_nabuprov(context, msg: str):
- returned_value = gleanerio(("prov"), "uva")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def uva_nabuorg(context, msg: str):
- returned_value = gleanerio(("orgs"), "uva")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def uva_naburelease(context, msg: str):
- returned_value = gleanerio(("release"), "uva")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@graph
-def harvest_uva():
- harvest = uva_gleaner()
- load1 = uva_nabu(harvest)
- load2 = uva_nabuprov(load1)
- load3 = uva_nabuorg(load2)
- load4 = uva_naburelease(load3)
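
NOTE: on S3Error, `s3reader` logs the error and implicitly returns None, so the later PUT to `containers/<id>/archive` goes out with an empty body and the container starts without its config. A sketch that propagates the error and releases the MinIO response instead; `s3reader_strict` is an illustrative name:

```python
def s3reader_strict(client, bucket: str, obj: str) -> bytes:
    """Fetch an object as bytes, raising S3Error instead of returning None."""
    resp = client.get_object(bucket, obj)  # client is a minio.Minio instance
    try:
        return resp.read()
    finally:
        resp.close()
        resp.release_conn()
```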
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_uwi.py b/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_uwi.py
deleted file mode 100644
index 19917adf..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_uwi.py
+++ /dev/null
@@ -1,304 +0,0 @@
-import distutils.util
-
-from dagster import op, graph, get_dagster_logger
-import subprocess
-import os, json, io
-import urllib
-from urllib import request
-from dagster import job, op, get_dagster_logger
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-
-# Vars and Envs
-
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-MINIO_URL = os.environ.get('GLEANER_MINIO_URL')
-MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT')
-MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL')
-MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET')
-MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY')
-MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET')
-
-
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- get_dagster_logger().info(f"S3 URL : {str(os.environ.get('GLEANER_MINIO_URL'))}")
- get_dagster_logger().info(f"S3 PORT : {str(os.environ.get('GLEANER_MINIO_PORT'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {str(os.environ.get('GLEANER_MINIO_BUCKET'))}")
- get_dagster_logger().info(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
- try:
- data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- client = Minio(
- server,
- # secure=True,
- secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname
- client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'),
- objPrefix,
- io.BytesIO(data),
- len(data))
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-
-
-def gleanerio(mode, source):
- ## ------------ Create
-
- get_dagster_logger().info(f"Create: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH')
- CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"]
- NAME = "gleaner01_" + source
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "nabu"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
- return 1
-
- data = {}
- data["Image"] = IMAGE
- data["Cmd"] = CMD
-
- # add env variables here, e.g. "Env": ["FOO=bar", "BAZ=quux"]
- enva = []
- enva.append(str("MINIO_URL={}".format(MINIO_URL)))
- enva.append(str("MINIO_PORT={}".format(MINIO_PORT)))
- enva.append(str("MINIO_SSL={}".format(MINIO_SSL)))
- enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET)))
- enva.append(str("MINIO_KEY={}".format(MINIO_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET)))
-
- data["Env"] = enva
-
- url = URL + 'containers/create'
- params = {
- "name": NAME
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- get_dagster_logger().info(f"URL: {str(url)}")
-
- req = request.Request(url, str.encode(json.dumps(data)))
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- c = r.read()
- d = json.loads(c)
- cid = d['Id']
-
- print(r.status)
- get_dagster_logger().info(f"Create: {str(r.status)}")
-
- # print(cid)
-
- ## ------------ Archive upload: send the config (read from MinIO below) into the container
-
- url = URL + 'containers/' + cid + '/archive'
- params = {
- 'path': ARCHIVE_PATH
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
-
- # DATA = read_file_bytestream(ARCHIVE_FILE)
- DATA = s3reader(ARCHIVE_FILE)
-
- req = request.Request(url, data=DATA, method="PUT")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- print(r.status)
- get_dagster_logger().info(f"Archive: {str(r.status)}")
-
- # c = r.read()
- # print(c)
- # d = json.loads(c)
- # print(d)
-
- ## ------------ Start
-
- url = URL + 'containers/' + cid + '/start'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Start: {str(r.status)}")
-
- ## ------------ Wait expect 200
-
- url = URL + 'containers/' + cid + '/wait'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Wait: {str(r.status)}")
-
- ## ------------ Copy logs expect 200
-
- url = URL + 'containers/' + cid + '/logs'
- params = {
- 'stdout': 'true',
- 'stderr': 'false'
- }
- query_string = urllib.parse.urlencode(params)
-
- url = url + "?" + query_string
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- c = r.read()
-
- # write to file
- # f = open(LOGFILE, 'w')
- # f.write(str(c))
- # f.close()
-
- # write to s3
- s3loader(c, NAME)  # s3loader needs a bytes-like object; r.read() already returned bytes
-
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"Logs: {str(r.status)}")
-
- ## ------------ Remove expect 204
-
- url = URL + 'containers/' + cid
- req = request.Request(url, method="DELETE")
- req.add_header('X-API-Key', APIKEY)
- # req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Remove: {str(r.status)}")
-
- return 0
-
-@op
-def uwi_gleaner(context):
- returned_value = gleanerio(("gleaner"), "uwi")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner notes are {r} ")
- return r
-
-@op
-def uwi_nabu(context, msg: str):
- returned_value = gleanerio(("nabu"), "uwi")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def uwi_nabuprov(context, msg: str):
- returned_value = gleanerio(("prov"), "uwi")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def uwi_nabuorg(context, msg: str):
- returned_value = gleanerio(("orgs"), "uwi")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def uwi_naburelease(context, msg: str):
- returned_value = gleanerio(("release"), "uwi")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@graph
-def harvest_uwi():
- harvest = uwi_gleaner()
- load1 = uwi_nabu(harvest)
- load2 = uwi_nabuprov(load1)
- load3 = uwi_nabuorg(load2)
- load4 = uwi_naburelease(load3)
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_vtti.py b/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_vtti.py
deleted file mode 100644
index 2e73d3f2..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_vtti.py
+++ /dev/null
@@ -1,304 +0,0 @@
-import distutils.util  # import the submodule explicitly; strtobool lives in distutils.util
-
-from dagster import op, graph, get_dagster_logger
-import subprocess
-import os, json, io
-import urllib
-from urllib import request
-from dagster import job, op, get_dagster_logger
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-
-# Vars and Envs
-
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-MINIO_URL = os.environ.get('GLEANER_MINIO_URL')
-MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT')
-MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL')
-MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET')
-MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY')
-MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET')
-
-
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- get_dagster_logger().info(f"S3 URL : {str(os.environ.get('GLEANER_MINIO_URL'))}")
- get_dagster_logger().info(f"S3 PORT : {str(os.environ.get('GLEANER_MINIO_PORT'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {str(os.environ.get('GLEANER_MINIO_BUCKET'))}")
- get_dagster_logger().info(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
- try:
- data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- client = Minio(
- server,
- # secure=True,
- secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname
- client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'),
- objPrefix,
- io.BytesIO(data),
- len(data))
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-
-
-def gleanerio(mode, source):
- ## ------------ Create
-
- get_dagster_logger().info(f"Create: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH')
- CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"]
- NAME = "gleaner01_" + source
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "nabu"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
- return 1
-
- data = {}
- data["Image"] = IMAGE
- data["Cmd"] = CMD
-
- # add env variables here, e.g. "Env": ["FOO=bar", "BAZ=quux"]
- enva = []
- enva.append(str("MINIO_URL={}".format(MINIO_URL)))
- enva.append(str("MINIO_PORT={}".format(MINIO_PORT)))
- enva.append(str("MINIO_SSL={}".format(MINIO_SSL)))
- enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET)))
- enva.append(str("MINIO_KEY={}".format(MINIO_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET)))
-
- data["Env"] = enva
-
- url = URL + 'containers/create'
- params = {
- "name": NAME
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- get_dagster_logger().info(f"URL: {str(url)}")
-
- req = request.Request(url, str.encode(json.dumps(data)))
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- c = r.read()
- d = json.loads(c)
- cid = d['Id']
-
- print(r.status)
- get_dagster_logger().info(f"Create: {str(r.status)}")
-
- # print(cid)
-
- ## ------------ Archive upload: send the config (read from MinIO below) into the container
-
- url = URL + 'containers/' + cid + '/archive'
- params = {
- 'path': ARCHIVE_PATH
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
-
- # DATA = read_file_bytestream(ARCHIVE_FILE)
- DATA = s3reader(ARCHIVE_FILE)
-
- req = request.Request(url, data=DATA, method="PUT")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- print(r.status)
- get_dagster_logger().info(f"Archive: {str(r.status)}")
-
- # c = r.read()
- # print(c)
- # d = json.loads(c)
- # print(d)
-
- ## ------------ Start
-
- url = URL + 'containers/' + cid + '/start'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Start: {str(r.status)}")
-
- ## ------------ Wait expect 200
-
- url = URL + 'containers/' + cid + '/wait'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Wait: {str(r.status)}")
-
- ## ------------ Copy logs expect 200
-
- url = URL + 'containers/' + cid + '/logs'
- params = {
- 'stdout': 'true',
- 'stderr': 'false'
- }
- query_string = urllib.parse.urlencode(params)
-
- url = url + "?" + query_string
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- c = r.read()
-
- # write to file
- # f = open(LOGFILE, 'w')
- # f.write(str(c))
- # f.close()
-
- # write to s3
- s3loader(c, NAME)  # s3loader needs a bytes-like object; r.read() already returned bytes
-
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"Logs: {str(r.status)}")
-
- ## ------------ Remove expect 204
-
- url = URL + 'containers/' + cid
- req = request.Request(url, method="DELETE")
- req.add_header('X-API-Key', APIKEY)
- # req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Remove: {str(r.status)}")
-
- return 0
-
-@op
-def vtti_gleaner(context):
- returned_value = gleanerio(("gleaner"), "vtti")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner notes are {r} ")
- return r
-
-@op
-def vtti_nabu(context, msg: str):
- returned_value = gleanerio(("nabu"), "vtti")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def vtti_nabuprov(context, msg: str):
- returned_value = gleanerio(("prov"), "vtti")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def vtti_nabuorg(context, msg: str):
- returned_value = gleanerio(("orgs"), "vtti")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def vtti_naburelease(context, msg: str):
- returned_value = gleanerio(("release"), "vtti")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@graph
-def harvest_vtti():
- harvest = vtti_gleaner()
- load1 = vtti_nabu(harvest)
- load2 = vtti_nabuprov(load1)
- load3 = vtti_nabuorg(load2)
- load4 = vtti_naburelease(load3)
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_wardr.py b/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_wardr.py
deleted file mode 100644
index ccf5be3b..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_wardr.py
+++ /dev/null
@@ -1,304 +0,0 @@
-import distutils.util  # import the submodule explicitly; strtobool lives in distutils.util
-
-from dagster import op, graph, get_dagster_logger
-import subprocess
-import os, json, io
-import urllib
-from urllib import request
-from dagster import job, op, get_dagster_logger
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-
-# Vars and Envs
-
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-MINIO_URL = os.environ.get('GLEANER_MINIO_URL')
-MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT')
-MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL')
-MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET')
-MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY')
-MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET')
-
-
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- get_dagster_logger().info(f"S3 URL : {str(os.environ.get('GLEANER_MINIO_URL'))}")
- get_dagster_logger().info(f"S3 PORT : {str(os.environ.get('GLEANER_MINIO_PORT'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {str(os.environ.get('GLEANER_MINIO_BUCKET'))}")
- get_dagster_logger().info(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
- try:
- data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- client = Minio(
- server,
- # secure=True,
- secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname
- client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'),
- objPrefix,
- io.BytesIO(data),
- len(data))
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-
-
-def gleanerio(mode, source):
- ## ------------ Create
-
- get_dagster_logger().info(f"Create: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH')
- CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"]
- NAME = "gleaner01_" + source
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "nabu"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
- return 1
-
- data = {}
- data["Image"] = IMAGE
- data["Cmd"] = CMD
-
- # add env variables here, e.g. "Env": ["FOO=bar", "BAZ=quux"]
- enva = []
- enva.append(str("MINIO_URL={}".format(MINIO_URL)))
- enva.append(str("MINIO_PORT={}".format(MINIO_PORT)))
- enva.append(str("MINIO_SSL={}".format(MINIO_SSL)))
- enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET)))
- enva.append(str("MINIO_KEY={}".format(MINIO_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET)))
-
- data["Env"] = enva
-
- url = URL + 'containers/create'
- params = {
- "name": NAME
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- get_dagster_logger().info(f"URL: {str(url)}")
-
- req = request.Request(url, str.encode(json.dumps(data)))
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- c = r.read()
- d = json.loads(c)
- cid = d['Id']
-
- print(r.status)
- get_dagster_logger().info(f"Create: {str(r.status)}")
-
- # print(cid)
-
- ## ------------ Archive upload: send the config (read from MinIO below) into the container
-
- url = URL + 'containers/' + cid + '/archive'
- params = {
- 'path': ARCHIVE_PATH
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
-
- # DATA = read_file_bytestream(ARCHIVE_FILE)
- DATA = s3reader(ARCHIVE_FILE)
-
- req = request.Request(url, data=DATA, method="PUT")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- print(r.status)
- get_dagster_logger().info(f"Archive: {str(r.status)}")
-
- # c = r.read()
- # print(c)
- # d = json.loads(c)
- # print(d)
-
- ## ------------ Start
-
- url = URL + 'containers/' + cid + '/start'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Start: {str(r.status)}")
-
- ## ------------ Wait expect 200
-
- url = URL + 'containers/' + cid + '/wait'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Wait: {str(r.status)}")
-
- ## ------------ Copy logs expect 200
-
- url = URL + 'containers/' + cid + '/logs'
- params = {
- 'stdout': 'true',
- 'stderr': 'false'
- }
- query_string = urllib.parse.urlencode(params)
-
- url = url + "?" + query_string
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- c = r.read()
-
- # write to file
- # f = open(LOGFILE, 'w')
- # f.write(str(c))
- # f.close()
-
- # write to s3
- s3loader(c, NAME)  # s3loader needs a bytes-like object; r.read() already returned bytes
-
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"Logs: {str(r.status)}")
-
- ## ------------ Remove expect 204
-
- url = URL + 'containers/' + cid
- req = request.Request(url, method="DELETE")
- req.add_header('X-API-Key', APIKEY)
- # req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Remove: {str(r.status)}")
-
- return 0
-
-@op
-def wardr_gleaner(context):
- returned_value = gleanerio(("gleaner"), "wardr")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner notes are {r} ")
- return r
-
-@op
-def wardr_nabu(context, msg: str):
- returned_value = gleanerio(("nabu"), "wardr")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def wardr_nabuprov(context, msg: str):
- returned_value = gleanerio(("prov"), "wardr")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def wardr_nabuorg(context, msg: str):
- returned_value = gleanerio(("orgs"), "wardr")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def wardr_naburelease(context, msg: str):
- returned_value = gleanerio(("release"), "wardr")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@graph
-def harvest_wardr():
- harvest = wardr_gleaner()
- load1 = wardr_nabu(harvest)
- load2 = wardr_nabuprov(load1)
- load3 = wardr_nabuorg(load2)
- load4 = wardr_naburelease(load3)
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_yalenus.py b/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_yalenus.py
deleted file mode 100644
index 301b6b11..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_yalenus.py
+++ /dev/null
@@ -1,304 +0,0 @@
-import distutils.util  # import the submodule explicitly; strtobool lives in distutils.util
-
-from dagster import op, graph, get_dagster_logger
-import subprocess
-import os, json, io
-import urllib
-from urllib import request
-from dagster import job, op, get_dagster_logger
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-
-# Vars and Envs
-
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-MINIO_URL = os.environ.get('GLEANER_MINIO_URL')
-MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT')
-MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL')
-MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET')
-MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY')
-MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET')
-
-
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- get_dagster_logger().info(f"S3 URL : {str(os.environ.get('GLEANER_MINIO_URL'))}")
- get_dagster_logger().info(f"S3 PORT : {str(os.environ.get('GLEANER_MINIO_PORT'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {str(os.environ.get('GLEANER_MINIO_BUCKET'))}")
- get_dagster_logger().info(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
- try:
- data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- client = Minio(
- server,
- # secure=True,
- secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname
- client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'),
- objPrefix,
- io.BytesIO(data),
- len(data))
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-
-
-def gleanerio(mode, source):
- ## ------------ Create
-
- get_dagster_logger().info(f"Create: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH')
- CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"]
- NAME = "gleaner01_" + source
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "nabu"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
- return 1
-
- data = {}
- data["Image"] = IMAGE
- data["Cmd"] = CMD
-
- # add env variables here, e.g. "Env": ["FOO=bar", "BAZ=quux"]
- enva = []
- enva.append(str("MINIO_URL={}".format(MINIO_URL)))
- enva.append(str("MINIO_PORT={}".format(MINIO_PORT)))
- enva.append(str("MINIO_SSL={}".format(MINIO_SSL)))
- enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET)))
- enva.append(str("MINIO_KEY={}".format(MINIO_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET)))
-
- data["Env"] = enva
-
- url = URL + 'containers/create'
- params = {
- "name": NAME
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- get_dagster_logger().info(f"URL: {str(url)}")
-
- req = request.Request(url, str.encode(json.dumps(data)))
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- c = r.read()
- d = json.loads(c)
- cid = d['Id']
-
- print(r.status)
- get_dagster_logger().info(f"Create: {str(r.status)}")
-
- # print(cid)
-
- ## ------------ Archive upload: send the config (read from MinIO below) into the container
-
- url = URL + 'containers/' + cid + '/archive'
- params = {
- 'path': ARCHIVE_PATH
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
-
- # DATA = read_file_bytestream(ARCHIVE_FILE)
- DATA = s3reader(ARCHIVE_FILE)
-
- req = request.Request(url, data=DATA, method="PUT")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- print(r.status)
- get_dagster_logger().info(f"Archive: {str(r.status)}")
-
- # c = r.read()
- # print(c)
- # d = json.loads(c)
- # print(d)
-
- ## ------------ Start
-
- url = URL + 'containers/' + cid + '/start'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Start: {str(r.status)}")
-
- ## ------------ Wait expect 200
-
- url = URL + 'containers/' + cid + '/wait'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Wait: {str(r.status)}")
-
- ## ------------ Copy logs expect 200
-
- url = URL + 'containers/' + cid + '/logs'
- params = {
- 'stdout': 'true',
- 'stderr': 'false'
- }
- query_string = urllib.parse.urlencode(params)
-
- url = url + "?" + query_string
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- c = r.read()
-
- # write to file
- # f = open(LOGFILE, 'w')
- # f.write(str(c))
- # f.close()
-
- # write to s3
- s3loader(c, NAME)  # s3loader needs a bytes-like object; r.read() already returned bytes
-
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"Logs: {str(r.status)}")
-
- ## ------------ Remove expect 204
-
- url = URL + 'containers/' + cid
- req = request.Request(url, method="DELETE")
- req.add_header('X-API-Key', APIKEY)
- # req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Remove: {str(r.status)}")
-
- return 0
-
-@op
-def yalenus_gleaner(context):
- returned_value = gleanerio(("gleaner"), "yalenus")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner notes are {r} ")
- return r
-
-@op
-def yalenus_nabu(context, msg: str):
- returned_value = gleanerio(("nabu"), "yalenus")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def yalenus_nabuprov(context, msg: str):
- returned_value = gleanerio(("prov"), "yalenus")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def yalenus_nabuorg(context, msg: str):
- returned_value = gleanerio(("orgs"), "yalenus")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def yalenus_naburelease(context, msg: str):
- returned_value = gleanerio(("release"), "yalenus")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@graph
-def harvest_yalenus():
- harvest = yalenus_gleaner()
- load1 = yalenus_nabu(harvest)
- load2 = yalenus_nabuprov(load1)
- load3 = yalenus_nabuorg(load2)
- load4 = yalenus_naburelease(load3)
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/repositories/repository.py b/dagster/implnets/generatedCode/implnet-nsdf/output/repositories/repository.py
deleted file mode 100644
index 8238ee49..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/repositories/repository.py
+++ /dev/null
@@ -1,139 +0,0 @@
-from dagster import repository
-from jobs.implnet_jobs_arecibo import implnet_job_arecibo
-from sch.implnet_sch_arecibo import implnet_sch_arecibo
-from jobs.implnet_jobs_aws import implnet_job_aws
-from sch.implnet_sch_aws import implnet_sch_aws
-from jobs.implnet_jobs_cyvers import implnet_job_cyvers
-from sch.implnet_sch_cyvers import implnet_sch_cyvers
-from jobs.implnet_jobs_drp import implnet_job_drp
-from sch.implnet_sch_drp import implnet_sch_drp
-from jobs.implnet_jobs_dryad import implnet_job_dryad
-from sch.implnet_sch_dryad import implnet_sch_dryad
-from jobs.implnet_jobs_matcommons import implnet_job_matcommons
-from sch.implnet_sch_matcommons import implnet_sch_matcommons
-from jobs.implnet_jobs_mdf import implnet_job_mdf
-from sch.implnet_sch_mdf import implnet_sch_mdf
-from jobs.implnet_jobs_neon import implnet_job_neon
-from sch.implnet_sch_neon import implnet_sch_neon
-from jobs.implnet_jobs_abacus import implnet_job_abacus
-from sch.implnet_sch_abacus import implnet_sch_abacus
-from jobs.implnet_jobs_acss import implnet_job_acss
-from sch.implnet_sch_acss import implnet_sch_acss
-from jobs.implnet_jobs_adf import implnet_job_adf
-from sch.implnet_sch_adf import implnet_sch_adf
-from jobs.implnet_jobs_asulrdr import implnet_job_asulrdr
-from sch.implnet_sch_asulrdr import implnet_sch_asulrdr
-from jobs.implnet_jobs_aussda import implnet_job_aussda
-from sch.implnet_sch_aussda import implnet_sch_aussda
-from jobs.implnet_jobs_borealis import implnet_job_borealis
-from sch.implnet_sch_borealis import implnet_sch_borealis
-from jobs.implnet_jobs_cifor import implnet_job_cifor
-from sch.implnet_sch_cifor import implnet_sch_cifor
-from jobs.implnet_jobs_cimmyt import implnet_job_cimmyt
-from sch.implnet_sch_cimmyt import implnet_sch_cimmyt
-from jobs.implnet_jobs_cora import implnet_job_cora
-from sch.implnet_sch_cora import implnet_sch_cora
-from jobs.implnet_jobs_crossda import implnet_job_crossda
-from sch.implnet_sch_crossda import implnet_sch_crossda
-from jobs.implnet_jobs_cuhk import implnet_job_cuhk
-from sch.implnet_sch_cuhk import implnet_sch_cuhk
-from jobs.implnet_jobs_tdi import implnet_job_tdi
-from sch.implnet_sch_tdi import implnet_sch_tdi
-from jobs.implnet_jobs_darus import implnet_job_darus
-from sch.implnet_sch_darus import implnet_sch_darus
-from jobs.implnet_jobs_irs import implnet_job_irs
-from sch.implnet_sch_irs import implnet_sch_irs
-from jobs.implnet_jobs_sceincespo import implnet_job_sceincespo
-from sch.implnet_sch_sceincespo import implnet_sch_sceincespo
-from jobs.implnet_jobs_edatos import implnet_job_edatos
-from sch.implnet_sch_edatos import implnet_sch_edatos
-from jobs.implnet_jobs_netherland import implnet_job_netherland
-from sch.implnet_sch_netherland import implnet_sch_netherland
-from jobs.implnet_jobs_norway import implnet_job_norway
-from sch.implnet_sch_norway import implnet_sch_norway
-from jobs.implnet_jobs_ntu import implnet_job_ntu
-from sch.implnet_sch_ntu import implnet_sch_ntu
-from jobs.implnet_jobs_fiu import implnet_job_fiu
-from sch.implnet_sch_fiu import implnet_sch_fiu
-from jobs.implnet_jobs_gro import implnet_job_gro
-from sch.implnet_sch_gro import implnet_sch_gro
-from jobs.implnet_jobs_harvard import implnet_job_harvard
-from sch.implnet_sch_harvard import implnet_sch_harvard
-from jobs.implnet_jobs_hord import implnet_job_hord
-from sch.implnet_sch_hord import implnet_sch_hord
-from jobs.implnet_jobs_ibict import implnet_job_ibict
-from sch.implnet_sch_ibict import implnet_sch_ibict
-from jobs.implnet_jobs_icrisat import implnet_job_icrisat
-from sch.implnet_sch_icrisat import implnet_sch_icrisat
-from jobs.implnet_jobs_ifdc import implnet_job_ifdc
-from sch.implnet_sch_ifdc import implnet_sch_ifdc
-from jobs.implnet_jobs_ifsttar import implnet_job_ifsttar
-from sch.implnet_sch_ifsttar import implnet_sch_ifsttar
-from jobs.implnet_jobs_iisg import implnet_job_iisg
-from sch.implnet_sch_iisg import implnet_sch_iisg
-from jobs.implnet_jobs_irl import implnet_job_irl
-from sch.implnet_sch_irl import implnet_sch_irl
-from jobs.implnet_jobs_ipc import implnet_job_ipc
-from sch.implnet_sch_ipc import implnet_sch_ipc
-from jobs.implnet_jobs_iit import implnet_job_iit
-from sch.implnet_sch_iit import implnet_sch_iit
-from jobs.implnet_jobs_hopkins import implnet_job_hopkins
-from sch.implnet_sch_hopkins import implnet_sch_hopkins
-from jobs.implnet_jobs_julich import implnet_job_julich
-from sch.implnet_sch_julich import implnet_sch_julich
-from jobs.implnet_jobs_uva import implnet_job_uva
-from sch.implnet_sch_uva import implnet_sch_uva
-from jobs.implnet_jobs_rin import implnet_job_rin
-from sch.implnet_sch_rin import implnet_sch_rin
-from jobs.implnet_jobs_lida import implnet_job_lida
-from sch.implnet_sch_lida import implnet_sch_lida
-from jobs.implnet_jobs_icarda import implnet_job_icarda
-from sch.implnet_sch_icarda import implnet_sch_icarda
-from jobs.implnet_jobs_nioz import implnet_job_nioz
-from sch.implnet_sch_nioz import implnet_sch_nioz
-from jobs.implnet_jobs_ucdl import implnet_job_ucdl
-from sch.implnet_sch_ucdl import implnet_sch_ucdl
-from jobs.implnet_jobs_ofd import implnet_job_ofd
-from sch.implnet_sch_ofd import implnet_sch_ofd
-from jobs.implnet_jobs_peking import implnet_job_peking
-from sch.implnet_sch_peking import implnet_sch_peking
-from jobs.implnet_jobs_pucdp import implnet_job_pucdp
-from sch.implnet_sch_pucdp import implnet_sch_pucdp
-from jobs.implnet_jobs_qdr import implnet_job_qdr
-from sch.implnet_sch_qdr import implnet_sch_qdr
-from jobs.implnet_jobs_chile import implnet_job_chile
-from sch.implnet_sch_chile import implnet_sch_chile
-from jobs.implnet_jobs_rosario import implnet_job_rosario
-from sch.implnet_sch_rosario import implnet_sch_rosario
-from jobs.implnet_jobs_pesquisa import implnet_job_pesquisa
-from sch.implnet_sch_pesquisa import implnet_sch_pesquisa
-from jobs.implnet_jobs_rsu import implnet_job_rsu
-from sch.implnet_sch_rsu import implnet_sch_rsu
-from jobs.implnet_jobs_tdl import implnet_job_tdl
-from sch.implnet_sch_tdl import implnet_sch_tdl
-from jobs.implnet_jobs_ucla import implnet_job_ucla
-from sch.implnet_sch_ucla import implnet_sch_ucla
-from jobs.implnet_jobs_unb import implnet_job_unb
-from sch.implnet_sch_unb import implnet_sch_unb
-from jobs.implnet_jobs_unc import implnet_job_unc
-from sch.implnet_sch_unc import implnet_sch_unc
-from jobs.implnet_jobs_manitoba import implnet_job_manitoba
-from sch.implnet_sch_manitoba import implnet_sch_manitoba
-from jobs.implnet_jobs_milano import implnet_job_milano
-from sch.implnet_sch_milano import implnet_sch_milano
-from jobs.implnet_jobs_uwi import implnet_job_uwi
-from sch.implnet_sch_uwi import implnet_sch_uwi
-from jobs.implnet_jobs_vtti import implnet_job_vtti
-from sch.implnet_sch_vtti import implnet_sch_vtti
-from jobs.implnet_jobs_wardr import implnet_job_wardr
-from sch.implnet_sch_wardr import implnet_sch_wardr
-from jobs.implnet_jobs_yalenus import implnet_job_yalenus
-from sch.implnet_sch_yalenus import implnet_sch_yalenus
-
-@repository
-def gleaner():
- jobs = [implnet_job_arecibo, implnet_job_aws, implnet_job_cyvers, implnet_job_drp, implnet_job_dryad, implnet_job_matcommons, implnet_job_mdf, implnet_job_neon, implnet_job_abacus, implnet_job_acss, implnet_job_adf, implnet_job_asulrdr, implnet_job_aussda, implnet_job_borealis, implnet_job_cifor, implnet_job_cimmyt, implnet_job_cora, implnet_job_crossda, implnet_job_cuhk, implnet_job_tdi, implnet_job_darus, implnet_job_irs, implnet_job_sceincespo, implnet_job_edatos, implnet_job_netherland, implnet_job_norway, implnet_job_ntu, implnet_job_fiu, implnet_job_gro, implnet_job_harvard, implnet_job_hord, implnet_job_ibict, implnet_job_icrisat, implnet_job_ifdc, implnet_job_ifsttar, implnet_job_iisg, implnet_job_irl, implnet_job_ipc, implnet_job_iit, implnet_job_hopkins, implnet_job_julich, implnet_job_uva, implnet_job_rin, implnet_job_lida, implnet_job_icarda, implnet_job_nioz, implnet_job_ucdl, implnet_job_ofd, implnet_job_peking, implnet_job_pucdp, implnet_job_qdr, implnet_job_chile, implnet_job_rosario, implnet_job_pesquisa, implnet_job_rsu, implnet_job_tdl, implnet_job_ucla, implnet_job_unb, implnet_job_unc, implnet_job_manitoba, implnet_job_milano, implnet_job_uwi, implnet_job_vtti, implnet_job_wardr, implnet_job_yalenus]
- schedules = [implnet_sch_arecibo, implnet_sch_aws, implnet_sch_cyvers, implnet_sch_drp, implnet_sch_dryad, implnet_sch_matcommons, implnet_sch_mdf, implnet_sch_neon, implnet_sch_abacus, implnet_sch_acss, implnet_sch_adf, implnet_sch_asulrdr, implnet_sch_aussda, implnet_sch_borealis, implnet_sch_cifor, implnet_sch_cimmyt, implnet_sch_cora, implnet_sch_crossda, implnet_sch_cuhk, implnet_sch_tdi, implnet_sch_darus, implnet_sch_irs, implnet_sch_sceincespo, implnet_sch_edatos, implnet_sch_netherland, implnet_sch_norway, implnet_sch_ntu, implnet_sch_fiu, implnet_sch_gro, implnet_sch_harvard, implnet_sch_hord, implnet_sch_ibict, implnet_sch_icrisat, implnet_sch_ifdc, implnet_sch_ifsttar, implnet_sch_iisg, implnet_sch_irl, implnet_sch_ipc, implnet_sch_iit, implnet_sch_hopkins, implnet_sch_julich, implnet_sch_uva, implnet_sch_rin, implnet_sch_lida, implnet_sch_icarda, implnet_sch_nioz, implnet_sch_ucdl, implnet_sch_ofd, implnet_sch_peking, implnet_sch_pucdp, implnet_sch_qdr, implnet_sch_chile, implnet_sch_rosario, implnet_sch_pesquisa, implnet_sch_rsu, implnet_sch_tdl, implnet_sch_ucla, implnet_sch_unb, implnet_sch_unc, implnet_sch_manitoba, implnet_sch_milano, implnet_sch_uwi, implnet_sch_vtti, implnet_sch_wardr, implnet_sch_yalenus]
-
-
- return jobs + schedules
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_abacus.py b/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_abacus.py
deleted file mode 100644
index 11ab7013..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_abacus.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_abacus import implnet_job_abacus
-
-@schedule(cron_schedule="0 0 * * 1", job=implnet_job_abacus, execution_timezone="US/Central")
-def implnet_sch_abacus(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_acss.py b/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_acss.py
deleted file mode 100644
index 91b4bd20..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_acss.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_acss import implnet_job_acss
-
-@schedule(cron_schedule="0 3 * * 1", job=implnet_job_acss, execution_timezone="US/Central")
-def implnet_sch_acss(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_adf.py b/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_adf.py
deleted file mode 100644
index aab8c315..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_adf.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_adf import implnet_job_adf
-
-@schedule(cron_schedule="0 6 * * 1", job=implnet_job_adf, execution_timezone="US/Central")
-def implnet_sch_adf(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_arecibo.py b/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_arecibo.py
deleted file mode 100644
index 24b535b8..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_arecibo.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_arecibo import implnet_job_arecibo
-
-@schedule(cron_schedule="0 0 * * 0", job=implnet_job_arecibo, execution_timezone="US/Central")
-def implnet_sch_arecibo(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_asulrdr.py b/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_asulrdr.py
deleted file mode 100644
index d751d8c1..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_asulrdr.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_asulrdr import implnet_job_asulrdr
-
-@schedule(cron_schedule="0 9 * * 1", job=implnet_job_asulrdr, execution_timezone="US/Central")
-def implnet_sch_asulrdr(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_aussda.py b/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_aussda.py
deleted file mode 100644
index 5e3a7de5..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_aussda.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_aussda import implnet_job_aussda
-
-@schedule(cron_schedule="0 12 * * 1", job=implnet_job_aussda, execution_timezone="US/Central")
-def implnet_sch_aussda(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_aws.py b/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_aws.py
deleted file mode 100644
index e078f291..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_aws.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_aws import implnet_job_aws
-
-@schedule(cron_schedule="0 3 * * 0", job=implnet_job_aws, execution_timezone="US/Central")
-def implnet_sch_aws(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_borealis.py b/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_borealis.py
deleted file mode 100644
index 6bea8376..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_borealis.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_borealis import implnet_job_borealis
-
-@schedule(cron_schedule="0 15 * * 1", job=implnet_job_borealis, execution_timezone="US/Central")
-def implnet_sch_borealis(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_chile.py b/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_chile.py
deleted file mode 100644
index ea503613..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_chile.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_chile import implnet_job_chile
-
-@schedule(cron_schedule="0 9 * * 6", job=implnet_job_chile, execution_timezone="US/Central")
-def implnet_sch_chile(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_cifor.py b/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_cifor.py
deleted file mode 100644
index 0cfb6379..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_cifor.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_cifor import implnet_job_cifor
-
-@schedule(cron_schedule="0 18 * * 1", job=implnet_job_cifor, execution_timezone="US/Central")
-def implnet_sch_cifor(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_cimmyt.py b/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_cimmyt.py
deleted file mode 100644
index 2a1362af..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_cimmyt.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_cimmyt import implnet_job_cimmyt
-
-@schedule(cron_schedule="0 21 * * 1", job=implnet_job_cimmyt, execution_timezone="US/Central")
-def implnet_sch_cimmyt(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_cora.py b/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_cora.py
deleted file mode 100644
index 74c44435..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_cora.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_cora import implnet_job_cora
-
-@schedule(cron_schedule="0 0 * * 2", job=implnet_job_cora, execution_timezone="US/Central")
-def implnet_sch_cora(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_crossda.py b/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_crossda.py
deleted file mode 100644
index 754bb45d..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_crossda.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_crossda import implnet_job_crossda
-
-@schedule(cron_schedule="0 3 * * 2", job=implnet_job_crossda, execution_timezone="US/Central")
-def implnet_sch_crossda(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_cuhk.py b/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_cuhk.py
deleted file mode 100644
index 8be91a5d..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_cuhk.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_cuhk import implnet_job_cuhk
-
-@schedule(cron_schedule="0 6 * * 2", job=implnet_job_cuhk, execution_timezone="US/Central")
-def implnet_sch_cuhk(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_cyvers.py b/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_cyvers.py
deleted file mode 100644
index fe5dbd1b..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_cyvers.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_cyvers import implnet_job_cyvers
-
-@schedule(cron_schedule="0 6 * * 0", job=implnet_job_cyvers, execution_timezone="US/Central")
-def implnet_sch_cyvers(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_darus.py b/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_darus.py
deleted file mode 100644
index ce481a78..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_darus.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_darus import implnet_job_darus
-
-@schedule(cron_schedule="0 12 * * 2", job=implnet_job_darus, execution_timezone="US/Central")
-def implnet_sch_darus(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_drp.py b/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_drp.py
deleted file mode 100644
index 58db9727..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_drp.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_drp import implnet_job_drp
-
-@schedule(cron_schedule="0 9 * * 0", job=implnet_job_drp, execution_timezone="US/Central")
-def implnet_sch_drp(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_dryad.py b/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_dryad.py
deleted file mode 100644
index d8d38019..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_dryad.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_dryad import implnet_job_dryad
-
-@schedule(cron_schedule="0 12 * * 0", job=implnet_job_dryad, execution_timezone="US/Central")
-def implnet_sch_dryad(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_edatos.py b/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_edatos.py
deleted file mode 100644
index ebcf8c9c..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_edatos.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_edatos import implnet_job_edatos
-
-@schedule(cron_schedule="0 21 * * 2", job=implnet_job_edatos, execution_timezone="US/Central")
-def implnet_sch_edatos(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_fiu.py b/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_fiu.py
deleted file mode 100644
index aabd7ded..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_fiu.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_fiu import implnet_job_fiu
-
-@schedule(cron_schedule="0 9 * * 3", job=implnet_job_fiu, execution_timezone="US/Central")
-def implnet_sch_fiu(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_gro.py b/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_gro.py
deleted file mode 100644
index 1b90b0be..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_gro.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_gro import implnet_job_gro
-
-@schedule(cron_schedule="0 12 * * 3", job=implnet_job_gro, execution_timezone="US/Central")
-def implnet_sch_gro(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_harvard.py b/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_harvard.py
deleted file mode 100644
index 56264ff6..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_harvard.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_harvard import implnet_job_harvard
-
-@schedule(cron_schedule="0 15 * * 3", job=implnet_job_harvard, execution_timezone="US/Central")
-def implnet_sch_harvard(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_hopkins.py b/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_hopkins.py
deleted file mode 100644
index 556a21f9..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_hopkins.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_hopkins import implnet_job_hopkins
-
-@schedule(cron_schedule="0 21 * * 4", job=implnet_job_hopkins, execution_timezone="US/Central")
-def implnet_sch_hopkins(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_hord.py b/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_hord.py
deleted file mode 100644
index 745e7e5d..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_hord.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_hord import implnet_job_hord
-
-@schedule(cron_schedule="0 18 * * 3", job=implnet_job_hord, execution_timezone="US/Central")
-def implnet_sch_hord(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_ibict.py b/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_ibict.py
deleted file mode 100644
index dd122902..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_ibict.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_ibict import implnet_job_ibict
-
-@schedule(cron_schedule="0 21 * * 3", job=implnet_job_ibict, execution_timezone="US/Central")
-def implnet_sch_ibict(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_icarda.py b/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_icarda.py
deleted file mode 100644
index 6a5b77dd..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_icarda.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_icarda import implnet_job_icarda
-
-@schedule(cron_schedule="0 12 * * 5", job=implnet_job_icarda, execution_timezone="US/Central")
-def implnet_sch_icarda(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_icrisat.py b/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_icrisat.py
deleted file mode 100644
index 16aae845..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_icrisat.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_icrisat import implnet_job_icrisat
-
-@schedule(cron_schedule="0 0 * * 4", job=implnet_job_icrisat, execution_timezone="US/Central")
-def implnet_sch_icrisat(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_ifdc.py b/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_ifdc.py
deleted file mode 100644
index 8ad696d4..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_ifdc.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_ifdc import implnet_job_ifdc
-
-@schedule(cron_schedule="0 3 * * 4", job=implnet_job_ifdc, execution_timezone="US/Central")
-def implnet_sch_ifdc(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_ifsttar.py b/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_ifsttar.py
deleted file mode 100644
index 6ec28f67..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_ifsttar.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_ifsttar import implnet_job_ifsttar
-
-@schedule(cron_schedule="0 6 * * 4", job=implnet_job_ifsttar, execution_timezone="US/Central")
-def implnet_sch_ifsttar(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_iisg.py b/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_iisg.py
deleted file mode 100644
index 3c896328..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_iisg.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_iisg import implnet_job_iisg
-
-@schedule(cron_schedule="0 9 * * 4", job=implnet_job_iisg, execution_timezone="US/Central")
-def implnet_sch_iisg(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_iit.py b/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_iit.py
deleted file mode 100644
index ebea679c..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_iit.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_iit import implnet_job_iit
-
-@schedule(cron_schedule="0 18 * * 4", job=implnet_job_iit, execution_timezone="US/Central")
-def implnet_sch_iit(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_ipc.py b/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_ipc.py
deleted file mode 100644
index 7411b495..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_ipc.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_ipc import implnet_job_ipc
-
-@schedule(cron_schedule="0 15 * * 4", job=implnet_job_ipc, execution_timezone="US/Central")
-def implnet_sch_ipc(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_irl.py b/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_irl.py
deleted file mode 100644
index b3cab340..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_irl.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_irl import implnet_job_irl
-
-@schedule(cron_schedule="0 12 * * 4", job=implnet_job_irl, execution_timezone="US/Central")
-def implnet_sch_irl(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_irs.py b/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_irs.py
deleted file mode 100644
index e3038c44..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_irs.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_irs import implnet_job_irs
-
-@schedule(cron_schedule="0 15 * * 2", job=implnet_job_irs, execution_timezone="US/Central")
-def implnet_sch_irs(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_julich.py b/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_julich.py
deleted file mode 100644
index e9d14937..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_julich.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_julich import implnet_job_julich
-
-@schedule(cron_schedule="0 0 * * 5", job=implnet_job_julich, execution_timezone="US/Central")
-def implnet_sch_julich(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_lida.py b/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_lida.py
deleted file mode 100644
index 1f9fcf74..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_lida.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_lida import implnet_job_lida
-
-@schedule(cron_schedule="0 9 * * 5", job=implnet_job_lida, execution_timezone="US/Central")
-def implnet_sch_lida(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_manitoba.py b/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_manitoba.py
deleted file mode 100644
index e736c4af..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_manitoba.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_manitoba import implnet_job_manitoba
-
-@schedule(cron_schedule="0 9 * * 0", job=implnet_job_manitoba, execution_timezone="US/Central")
-def implnet_sch_manitoba(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_matcommons.py b/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_matcommons.py
deleted file mode 100644
index 7c85eaeb..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_matcommons.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_matcommons import implnet_job_matcommons
-
-@schedule(cron_schedule="0 15 * * 0", job=implnet_job_matcommons, execution_timezone="US/Central")
-def implnet_sch_matcommons(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_mdf.py b/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_mdf.py
deleted file mode 100644
index 25197950..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_mdf.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_mdf import implnet_job_mdf
-
-@schedule(cron_schedule="0 18 * * 0", job=implnet_job_mdf, execution_timezone="US/Central")
-def implnet_sch_mdf(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_milano.py b/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_milano.py
deleted file mode 100644
index 06072f0b..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_milano.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_milano import implnet_job_milano
-
-@schedule(cron_schedule="0 12 * * 0", job=implnet_job_milano, execution_timezone="US/Central")
-def implnet_sch_milano(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_neon.py b/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_neon.py
deleted file mode 100644
index 5a26d9cc..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_neon.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_neon import implnet_job_neon
-
-@schedule(cron_schedule="0 21 * * 0", job=implnet_job_neon, execution_timezone="US/Central")
-def implnet_sch_neon(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_netherland.py b/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_netherland.py
deleted file mode 100644
index 1711cde9..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_netherland.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_netherland import implnet_job_netherland
-
-@schedule(cron_schedule="0 0 * * 3", job=implnet_job_netherland, execution_timezone="US/Central")
-def implnet_sch_netherland(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_nioz.py b/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_nioz.py
deleted file mode 100644
index de7e3899..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_nioz.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_nioz import implnet_job_nioz
-
-@schedule(cron_schedule="0 15 * * 5", job=implnet_job_nioz, execution_timezone="US/Central")
-def implnet_sch_nioz(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_norway.py b/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_norway.py
deleted file mode 100644
index 29fe289b..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_norway.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_norway import implnet_job_norway
-
-@schedule(cron_schedule="0 3 * * 3", job=implnet_job_norway, execution_timezone="US/Central")
-def implnet_sch_norway(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_ntu.py b/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_ntu.py
deleted file mode 100644
index 2aa2a95c..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_ntu.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_ntu import implnet_job_ntu
-
-@schedule(cron_schedule="0 6 * * 3", job=implnet_job_ntu, execution_timezone="US/Central")
-def implnet_sch_ntu(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_ofd.py b/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_ofd.py
deleted file mode 100644
index d803ed4d..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_ofd.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_ofd import implnet_job_ofd
-
-@schedule(cron_schedule="0 21 * * 5", job=implnet_job_ofd, execution_timezone="US/Central")
-def implnet_sch_ofd(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_peking.py b/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_peking.py
deleted file mode 100644
index 6306642b..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_peking.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_peking import implnet_job_peking
-
-@schedule(cron_schedule="0 0 * * 6", job=implnet_job_peking, execution_timezone="US/Central")
-def implnet_sch_peking(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_pesquisa.py b/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_pesquisa.py
deleted file mode 100644
index e4f2d6df..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_pesquisa.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_pesquisa import implnet_job_pesquisa
-
-@schedule(cron_schedule="0 15 * * 6", job=implnet_job_pesquisa, execution_timezone="US/Central")
-def implnet_sch_pesquisa(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_pucdp.py b/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_pucdp.py
deleted file mode 100644
index 2718840b..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_pucdp.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_pucdp import implnet_job_pucdp
-
-@schedule(cron_schedule="0 3 * * 6", job=implnet_job_pucdp, execution_timezone="US/Central")
-def implnet_sch_pucdp(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_qdr.py b/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_qdr.py
deleted file mode 100644
index 8969f6ed..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_qdr.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_qdr import implnet_job_qdr
-
-@schedule(cron_schedule="0 6 * * 6", job=implnet_job_qdr, execution_timezone="US/Central")
-def implnet_sch_qdr(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_rin.py b/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_rin.py
deleted file mode 100644
index 25383578..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_rin.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_rin import implnet_job_rin
-
-@schedule(cron_schedule="0 6 * * 5", job=implnet_job_rin, execution_timezone="US/Central")
-def implnet_sch_rin(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_rosario.py b/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_rosario.py
deleted file mode 100644
index 9fa8e0d0..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_rosario.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_rosario import implnet_job_rosario
-
-@schedule(cron_schedule="0 12 * * 6", job=implnet_job_rosario, execution_timezone="US/Central")
-def implnet_sch_rosario(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_rsu.py b/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_rsu.py
deleted file mode 100644
index 63021fe0..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_rsu.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_rsu import implnet_job_rsu
-
-@schedule(cron_schedule="0 18 * * 6", job=implnet_job_rsu, execution_timezone="US/Central")
-def implnet_sch_rsu(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_sceincespo.py b/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_sceincespo.py
deleted file mode 100644
index 69537d9c..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_sceincespo.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_sceincespo import implnet_job_sceincespo
-
-@schedule(cron_schedule="0 18 * * 2", job=implnet_job_sceincespo, execution_timezone="US/Central")
-def implnet_sch_sceincespo(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_tdi.py b/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_tdi.py
deleted file mode 100644
index e836b9c6..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_tdi.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_tdi import implnet_job_tdi
-
-@schedule(cron_schedule="0 9 * * 2", job=implnet_job_tdi, execution_timezone="US/Central")
-def implnet_sch_tdi(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_tdl.py b/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_tdl.py
deleted file mode 100644
index cfacd56d..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_tdl.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_tdl import implnet_job_tdl
-
-@schedule(cron_schedule="0 21 * * 6", job=implnet_job_tdl, execution_timezone="US/Central")
-def implnet_sch_tdl(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_ucdl.py b/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_ucdl.py
deleted file mode 100644
index e624cb9d..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_ucdl.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_ucdl import implnet_job_ucdl
-
-@schedule(cron_schedule="0 18 * * 5", job=implnet_job_ucdl, execution_timezone="US/Central")
-def implnet_sch_ucdl(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_ucla.py b/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_ucla.py
deleted file mode 100644
index f41b3af2..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_ucla.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_ucla import implnet_job_ucla
-
-@schedule(cron_schedule="0 0 * * 0", job=implnet_job_ucla, execution_timezone="US/Central")
-def implnet_sch_ucla(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_unb.py b/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_unb.py
deleted file mode 100644
index ff7a9689..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_unb.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_unb import implnet_job_unb
-
-@schedule(cron_schedule="0 3 * * 0", job=implnet_job_unb, execution_timezone="US/Central")
-def implnet_sch_unb(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_unc.py b/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_unc.py
deleted file mode 100644
index 68693458..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_unc.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_unc import implnet_job_unc
-
-@schedule(cron_schedule="0 6 * * 0", job=implnet_job_unc, execution_timezone="US/Central")
-def implnet_sch_unc(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_uva.py b/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_uva.py
deleted file mode 100644
index 263453bd..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_uva.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_uva import implnet_job_uva
-
-@schedule(cron_schedule="0 3 * * 5", job=implnet_job_uva, execution_timezone="US/Central")
-def implnet_sch_uva(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_uwi.py b/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_uwi.py
deleted file mode 100644
index 63bd54b4..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_uwi.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_uwi import implnet_job_uwi
-
-@schedule(cron_schedule="0 15 * * 0", job=implnet_job_uwi, execution_timezone="US/Central")
-def implnet_sch_uwi(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_vtti.py b/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_vtti.py
deleted file mode 100644
index 4cb97dfa..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_vtti.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_vtti import implnet_job_vtti
-
-@schedule(cron_schedule="0 18 * * 0", job=implnet_job_vtti, execution_timezone="US/Central")
-def implnet_sch_vtti(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_wardr.py b/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_wardr.py
deleted file mode 100644
index 0f7fe00f..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_wardr.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_wardr import implnet_job_wardr
-
-@schedule(cron_schedule="0 21 * * 0", job=implnet_job_wardr, execution_timezone="US/Central")
-def implnet_sch_wardr(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_yalenus.py b/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_yalenus.py
deleted file mode 100644
index 4e29c86d..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_yalenus.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_yalenus import implnet_job_yalenus
-
-@schedule(cron_schedule="0 0 * * 1", job=implnet_job_yalenus, execution_timezone="US/Central")
-def implnet_sch_yalenus(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/workspace.yaml b/dagster/implnets/generatedCode/implnet-nsdf/output/workspace.yaml
deleted file mode 100644
index 54490e1d..00000000
--- a/dagster/implnets/generatedCode/implnet-nsdf/output/workspace.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-load_from:
- - python_file:
- relative_path: "repositories/repository.py"
- working_directory: .
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_africaioc.py b/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_africaioc.py
deleted file mode 100644
index 56965af5..00000000
--- a/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_africaioc.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_africaioc import harvest_africaioc
-
-@job
-def implnet_job_africaioc():
- harvest_africaioc()
\ No newline at end of file
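
Each deleted job module below repeats the same three lines around a different source name: import the generated harvest graph, declare a `@job`, call the graph. A hedged sketch of an equivalent in-memory factory (names are illustrative; this is not the project's generator code):

```python
# Sketch only: builds the implnet_job_<source> pattern programmatically,
# using one stand-in op in place of the per-source harvest graphs.
from dagster import job, op

@op
def harvest_source():
    pass  # stand-in for the generated harvest_<source> graph

def make_job(source: str):
    @job(name=f"implnet_job_{source}")
    def _job():
        harvest_source()
    return _job

jobs = [make_job(s) for s in ("africaioc", "aquadocs")]
```
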
diff --git a/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_aquadocs.py b/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_aquadocs.py
deleted file mode 100644
index 8d384135..00000000
--- a/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_aquadocs.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_aquadocs import harvest_aquadocs
-
-@job
-def implnet_job_aquadocs():
- harvest_aquadocs()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_benguelacc.py b/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_benguelacc.py
deleted file mode 100644
index 650fe167..00000000
--- a/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_benguelacc.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_benguelacc import harvest_benguelacc
-
-@job
-def implnet_job_benguelacc():
- harvest_benguelacc()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_caribbeanmarineatlas.py b/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_caribbeanmarineatlas.py
deleted file mode 100644
index 2761d08d..00000000
--- a/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_caribbeanmarineatlas.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_caribbeanmarineatlas import harvest_caribbeanmarineatlas
-
-@job
-def implnet_job_caribbeanmarineatlas():
- harvest_caribbeanmarineatlas()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_cioos.py b/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_cioos.py
deleted file mode 100644
index b16b90cd..00000000
--- a/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_cioos.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_cioos import harvest_cioos
-
-@job
-def implnet_job_cioos():
- harvest_cioos()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_edmerp.py b/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_edmerp.py
deleted file mode 100644
index 3415a757..00000000
--- a/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_edmerp.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_edmerp import harvest_edmerp
-
-@job
-def implnet_job_edmerp():
- harvest_edmerp()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_edmo.py b/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_edmo.py
deleted file mode 100644
index bc379313..00000000
--- a/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_edmo.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_edmo import harvest_edmo
-
-@job
-def implnet_job_edmo():
- harvest_edmo()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_emodnet.py b/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_emodnet.py
deleted file mode 100644
index 34329714..00000000
--- a/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_emodnet.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_emodnet import harvest_emodnet
-
-@job
-def implnet_job_emodnet():
- harvest_emodnet()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_euroceanevents.py b/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_euroceanevents.py
deleted file mode 100644
index a407682d..00000000
--- a/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_euroceanevents.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_euroceanevents import harvest_euroceanevents
-
-@job
-def implnet_job_euroceanevents():
- harvest_euroceanevents()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_euroceanexperts.py b/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_euroceanexperts.py
deleted file mode 100644
index 8e937e18..00000000
--- a/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_euroceanexperts.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_euroceanexperts import harvest_euroceanexperts
-
-@job
-def implnet_job_euroceanexperts():
- harvest_euroceanexperts()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_euroceaninstitutions.py b/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_euroceaninstitutions.py
deleted file mode 100644
index 243f18df..00000000
--- a/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_euroceaninstitutions.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_euroceaninstitutions import harvest_euroceaninstitutions
-
-@job
-def implnet_job_euroceaninstitutions():
- harvest_euroceaninstitutions()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_euroceanorgs.py b/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_euroceanorgs.py
deleted file mode 100644
index e5b703f8..00000000
--- a/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_euroceanorgs.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_euroceanorgs import harvest_euroceanorgs
-
-@job
-def implnet_job_euroceanorgs():
- harvest_euroceanorgs()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_euroceanprojects.py b/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_euroceanprojects.py
deleted file mode 100644
index d96782bd..00000000
--- a/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_euroceanprojects.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_euroceanprojects import harvest_euroceanprojects
-
-@job
-def implnet_job_euroceanprojects():
- harvest_euroceanprojects()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_euroceantraining.py b/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_euroceantraining.py
deleted file mode 100644
index 3c0a1903..00000000
--- a/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_euroceantraining.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_euroceantraining import harvest_euroceantraining
-
-@job
-def implnet_job_euroceantraining():
- harvest_euroceantraining()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_euroceanvessels.py b/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_euroceanvessels.py
deleted file mode 100644
index e21b1990..00000000
--- a/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_euroceanvessels.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_euroceanvessels import harvest_euroceanvessels
-
-@job
-def implnet_job_euroceanvessels():
- harvest_euroceanvessels()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_inanodc.py b/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_inanodc.py
deleted file mode 100644
index e432faed..00000000
--- a/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_inanodc.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_inanodc import harvest_inanodc
-
-@job
-def implnet_job_inanodc():
- harvest_inanodc()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_invemardocuments.py b/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_invemardocuments.py
deleted file mode 100644
index 35197b05..00000000
--- a/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_invemardocuments.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_invemardocuments import harvest_invemardocuments
-
-@job
-def implnet_job_invemardocuments():
- harvest_invemardocuments()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_invemarexperts.py b/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_invemarexperts.py
deleted file mode 100644
index 32e94d0c..00000000
--- a/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_invemarexperts.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_invemarexperts import harvest_invemarexperts
-
-@job
-def implnet_job_invemarexperts():
- harvest_invemarexperts()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_invemarinstitutions.py b/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_invemarinstitutions.py
deleted file mode 100644
index 92cf2724..00000000
--- a/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_invemarinstitutions.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_invemarinstitutions import harvest_invemarinstitutions
-
-@job
-def implnet_job_invemarinstitutions():
- harvest_invemarinstitutions()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_invemartraining.py b/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_invemartraining.py
deleted file mode 100644
index efb05401..00000000
--- a/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_invemartraining.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_invemartraining import harvest_invemartraining
-
-@job
-def implnet_job_invemartraining():
- harvest_invemartraining()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_invemarvessels.py b/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_invemarvessels.py
deleted file mode 100644
index 356bdf6d..00000000
--- a/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_invemarvessels.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_invemarvessels import harvest_invemarvessels
-
-@job
-def implnet_job_invemarvessels():
- harvest_invemarvessels()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_marinetraining.py b/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_marinetraining.py
deleted file mode 100644
index ecf79a91..00000000
--- a/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_marinetraining.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_marinetraining import harvest_marinetraining
-
-@job
-def implnet_job_marinetraining():
- harvest_marinetraining()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_maspawio.py b/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_maspawio.py
deleted file mode 100644
index 7352ae89..00000000
--- a/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_maspawio.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_maspawio import harvest_maspawio
-
-@job
-def implnet_job_maspawio():
- harvest_maspawio()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_obis.py b/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_obis.py
deleted file mode 100644
index e04416ef..00000000
--- a/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_obis.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_obis import harvest_obis
-
-@job
-def implnet_job_obis():
- harvest_obis()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_obps.py b/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_obps.py
deleted file mode 100644
index 6477fe4a..00000000
--- a/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_obps.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_obps import harvest_obps
-
-@job
-def implnet_job_obps():
- harvest_obps()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_oceanexperts.py b/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_oceanexperts.py
deleted file mode 100644
index ea8d50ef..00000000
--- a/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_oceanexperts.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_oceanexperts import harvest_oceanexperts
-
-@job
-def implnet_job_oceanexperts():
- harvest_oceanexperts()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_oceanscape.py b/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_oceanscape.py
deleted file mode 100644
index 2b842fbc..00000000
--- a/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_oceanscape.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_oceanscape import harvest_oceanscape
-
-@job
-def implnet_job_oceanscape():
- harvest_oceanscape()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_pdh.py b/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_pdh.py
deleted file mode 100644
index 3e123926..00000000
--- a/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_pdh.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_pdh import harvest_pdh
-
-@job
-def implnet_job_pdh():
- harvest_pdh()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_pogo.py b/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_pogo.py
deleted file mode 100644
index 5feb230e..00000000
--- a/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_pogo.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_pogo import harvest_pogo
-
-@job
-def implnet_job_pogo():
- harvest_pogo()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_africaioc.py b/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_africaioc.py
deleted file mode 100644
index 92541474..00000000
--- a/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_africaioc.py
+++ /dev/null
@@ -1,293 +0,0 @@
-from dagster import op, graph, get_dagster_logger
-import subprocess
-import os, json, io
-import urllib
-from urllib import request
-from dagster import job, op, get_dagster_logger
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-
-# Vars and Envs
-
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-MINIO_URL = os.environ.get('GLEANER_MINIO_URL')
-MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT')
-MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL')
-MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET')
-MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY')
-MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET')
-
-
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- client = Minio(
- server,
- secure=False,
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
- try:
- data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- client = Minio(
- server,
- secure=False,
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname
- client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'),
- objPrefix,
- io.BytesIO(data),
- len(data))
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-
-
-def gleanerio(mode, source):
- ## ------------ Create
-
- get_dagster_logger().info(f"Create: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH')
- CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"]
- NAME = "gleaner01_" + source
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "nabu"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
- return 1
-
- data = {}
- data["Image"] = IMAGE
- data["Cmd"] = CMD
-
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
- enva = []
- enva.append(str("MINIO_URL={}".format(MINIO_URL)))
- enva.append(str("MINIO_PORT={}".format(MINIO_PORT)))
- enva.append(str("MINIO_SSL={}".format(MINIO_SSL)))
- enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET)))
- enva.append(str("MINIO_KEY={}".format(MINIO_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET)))
-
- data["Env"] = enva
-
- url = URL + 'containers/create'
- params = {
- "name": NAME
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- get_dagster_logger().info(f"URL: {str(url)}")
-
- req = request.Request(url, str.encode(json.dumps(data)))
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- c = r.read()
- d = json.loads(c)
- cid = d['Id']
-
- print(r.status)
- get_dagster_logger().info(f"Create: {str(r.status)}")
-
- # print(cid)
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
- url = URL + 'containers/' + cid + '/archive'
- params = {
- 'path': ARCHIVE_PATH
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
-
- # DATA = read_file_bytestream(ARCHIVE_FILE)
- DATA = s3reader(ARCHIVE_FILE)
-
- req = request.Request(url, data=DATA, method="PUT")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- print(r.status)
- get_dagster_logger().info(f"Archive: {str(r.status)}")
-
- # c = r.read()
- # print(c)
- # d = json.loads(c)
- # print(d)
-
- ## ------------ Start
-
- url = URL + 'containers/' + cid + '/start'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Start: {str(r.status)}")
-
- ## ------------ Wait expect 200
-
- url = URL + 'containers/' + cid + '/wait'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Wait: {str(r.status)}")
-
- ## ------------ Copy logs expect 200
-
- url = URL + 'containers/' + cid + '/logs'
- params = {
- 'stdout': 'true',
- 'stderr': 'false'
- }
- query_string = urllib.parse.urlencode(params)
-
- url = url + "?" + query_string
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- c = r.read()
-
- # write to file
- # f = open(LOGFILE, 'w')
- # f.write(str(c))
- # f.close()
-
- # write to s3
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
-
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"Logs: {str(r.status)}")
-
- ## ------------ Remove expect 204
-
- url = URL + 'containers/' + cid
- req = request.Request(url, method="DELETE")
- req.add_header('X-API-Key', APIKEY)
- # req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Remove: {str(r.status)}")
-
- return 0
-
-@op
-def africaioc_gleaner(context):
- returned_value = gleanerio(("gleaner"), "africaioc")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner notes are {r} ")
- return r
-
-@op
-def africaioc_nabu(context, msg: str):
- returned_value = gleanerio(("nabu"), "africaioc")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def africaioc_nabuprov(context, msg: str):
- returned_value = gleanerio(("prov"), "africaioc")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def africaioc_nabuorg(context, msg: str):
- returned_value = gleanerio(("orgs"), "africaioc")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def africaioc_naburelease(context, msg: str):
- returned_value = gleanerio(("release"), "africaioc")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@graph
-def harvest_africaioc():
- harvest = africaioc_gleaner()
- load1 = africaioc_nabu(harvest)
- load2 = africaioc_nabuprov(load1)
- load3 = africaioc_nabuorg(load2)
- load4 = africaioc_naburelease(load3)
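
The ops modules deleted here differ only in the source name; the shared `gleanerio()` helper drives one Docker container per run through the Portainer-proxied engine API: create, push the config archive, start, wait for exit, collect logs, remove. The sequence, condensed into a standalone sketch (the endpoints, headers, and `PORTAINER_*` variables mirror the code above; `api()` and `run_once()` are illustrative names, not part of the project):

```python
# Sketch only: the container lifecycle gleanerio() walks through,
# assuming URL points at a Portainer-proxied Docker engine API root
# (trailing slash included) and APIKEY is a Portainer access token.
import json
import os
import urllib.parse
from urllib import request

URL = os.environ["PORTAINER_URL"]
APIKEY = os.environ["PORTAINER_KEY"]

def api(path, method="GET", data=None, content_type="application/json"):
    req = request.Request(URL + path, data=data, method=method)
    req.add_header("X-API-Key", APIKEY)
    req.add_header("content-type", content_type)
    req.add_header("accept", "application/json")
    return request.urlopen(req)

def run_once(image, cmd, name, archive_bytes, archive_path):
    # 1. create the container
    body = json.dumps({"Image": image, "Cmd": cmd}).encode()
    q = urllib.parse.urlencode({"name": name})
    cid = json.loads(api(f"containers/create?{q}", "POST", body).read())["Id"]
    # 2. push the config archive (a tar stream) into the container filesystem
    q = urllib.parse.urlencode({"path": archive_path})
    api(f"containers/{cid}/archive?{q}", "PUT", archive_bytes,
        content_type="application/x-compressed")
    # 3. start, then block until the container exits
    api(f"containers/{cid}/start", "POST")
    api(f"containers/{cid}/wait", "POST")
    # 4. collect stdout, then remove the container
    q = urllib.parse.urlencode({"stdout": "true", "stderr": "false"})
    logs = api(f"containers/{cid}/logs?{q}").read()
    api(f"containers/{cid}", "DELETE")
    return logs
```
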
diff --git a/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_aquadocs.py b/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_aquadocs.py
deleted file mode 100644
index 2617335b..00000000
--- a/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_aquadocs.py
+++ /dev/null
@@ -1,293 +0,0 @@
-from dagster import op, graph, get_dagster_logger
-import subprocess
-import os, json, io
-import urllib
-from urllib import request
-from dagster import job, op, get_dagster_logger
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-
-# Vars and Envs
-
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-MINIO_URL = os.environ.get('GLEANER_MINIO_URL')
-MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT')
-MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL')
-MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET')
-MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY')
-MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET')
-
-
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- client = Minio(
- server,
- secure=False,
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
- try:
- data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- client = Minio(
- server,
- secure=False,
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname
- client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'),
- objPrefix,
- io.BytesIO(data),
- len(data))
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-
-
-def gleanerio(mode, source):
- ## ------------ Create
-
- get_dagster_logger().info(f"Create: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH')
- CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"]
- NAME = "gleaner01_" + source
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "nabu"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
- return 1
-
- data = {}
- data["Image"] = IMAGE
- data["Cmd"] = CMD
-
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
- enva = []
- enva.append(str("MINIO_URL={}".format(MINIO_URL)))
- enva.append(str("MINIO_PORT={}".format(MINIO_PORT)))
- enva.append(str("MINIO_SSL={}".format(MINIO_SSL)))
- enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET)))
- enva.append(str("MINIO_KEY={}".format(MINIO_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET)))
-
- data["Env"] = enva
-
- url = URL + 'containers/create'
- params = {
- "name": NAME
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- get_dagster_logger().info(f"URL: {str(url)}")
-
- req = request.Request(url, str.encode(json.dumps(data)))
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- c = r.read()
- d = json.loads(c)
- cid = d['Id']
-
- print(r.status)
- get_dagster_logger().info(f"Create: {str(r.status)}")
-
- # print(cid)
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
- url = URL + 'containers/' + cid + '/archive'
- params = {
- 'path': ARCHIVE_PATH
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
-
- # DATA = read_file_bytestream(ARCHIVE_FILE)
- DATA = s3reader(ARCHIVE_FILE)
-
- req = request.Request(url, data=DATA, method="PUT")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- print(r.status)
- get_dagster_logger().info(f"Archive: {str(r.status)}")
-
- # c = r.read()
- # print(c)
- # d = json.loads(c)
- # print(d)
-
- ## ------------ Start
-
- url = URL + 'containers/' + cid + '/start'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Start: {str(r.status)}")
-
- ## ------------ Wait expect 200
-
- url = URL + 'containers/' + cid + '/wait'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Wait: {str(r.status)}")
-
- ## ------------ Copy logs expect 200
-
- url = URL + 'containers/' + cid + '/logs'
- params = {
- 'stdout': 'true',
- 'stderr': 'false'
- }
- query_string = urllib.parse.urlencode(params)
-
- url = url + "?" + query_string
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- c = r.read()
-
- # write to file
- # f = open(LOGFILE, 'w')
- # f.write(str(c))
- # f.close()
-
- # write to s3
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
-
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"Logs: {str(r.status)}")
-
- ## ------------ Remove expect 204
-
- url = URL + 'containers/' + cid
- req = request.Request(url, method="DELETE")
- req.add_header('X-API-Key', APIKEY)
- # req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Remove: {str(r.status)}")
-
- return 0
-
-@op
-def aquadocs_gleaner(context):
- returned_value = gleanerio(("gleaner"), "aquadocs")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner notes are {r} ")
- return r
-
-@op
-def aquadocs_nabu(context, msg: str):
- returned_value = gleanerio(("nabu"), "aquadocs")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def aquadocs_nabuprov(context, msg: str):
- returned_value = gleanerio(("prov"), "aquadocs")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def aquadocs_nabuorg(context, msg: str):
- returned_value = gleanerio(("orgs"), "aquadocs")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def aquadocs_naburelease(context, msg: str):
- returned_value = gleanerio(("release"), "aquadocs")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@graph
-def harvest_aquadocs():
- harvest = aquadocs_gleaner()
- load1 = aquadocs_nabu(harvest)
- load2 = aquadocs_nabuprov(load1)
- load3 = aquadocs_nabuorg(load2)
- load4 = aquadocs_naburelease(load3)
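
The `gleanerio()` helper repeated in every deleted module drives one pass of the Docker API, via Portainer, through six steps: create the container, PUT the config archive into it, start it, wait for exit, pull stdout, delete it. A condensed sketch of that lifecycle; the endpoints and headers are taken from the code above, while `base` is assumed to already carry the Docker-API prefix that `PORTAINER_URL` holds and `api_key` is the `X-API-Key` value:

```python
import json
import urllib.parse
from urllib import request

def docker_call(base, api_key, path, method="POST", body=None,
                params=None, content_type="application/json"):
    # One Portainer-proxied Docker API request, with the headers the
    # generated code sets on every call.
    url = base + path
    if params:
        url = url + "?" + urllib.parse.urlencode(params)
    req = request.Request(url, data=body, method=method)
    req.add_header("X-API-Key", api_key)
    req.add_header("content-type", content_type)
    req.add_header("accept", "application/json")
    return request.urlopen(req)

def run_once(base, api_key, image, cmd, name, archive, archive_path):
    # create -> load config archive -> start -> wait -> logs -> remove
    spec = json.dumps({"Image": image, "Cmd": cmd}).encode()
    created = docker_call(base, api_key, "containers/create",
                          body=spec, params={"name": name})
    cid = json.loads(created.read())["Id"]
    docker_call(base, api_key, f"containers/{cid}/archive", method="PUT",
                body=archive, params={"path": archive_path},
                content_type="application/x-compressed")
    docker_call(base, api_key, f"containers/{cid}/start")
    docker_call(base, api_key, f"containers/{cid}/wait")
    logs = docker_call(base, api_key, f"containers/{cid}/logs", method="GET",
                       params={"stdout": "true", "stderr": "false"}).read()
    docker_call(base, api_key, f"containers/{cid}", method="DELETE")
    return logs
```
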
diff --git a/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_benguelacc.py b/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_benguelacc.py
deleted file mode 100644
index 4167640d..00000000
--- a/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_benguelacc.py
+++ /dev/null
@@ -1,293 +0,0 @@
-from dagster import op, graph, get_dagster_logger
-import subprocess
-import os, json, io
-import urllib
-from urllib import request
-from dagster import job, op, get_dagster_logger
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-
-# Vars and Envs
-
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-MINIO_URL = os.environ.get('GLEANER_MINIO_URL')
-MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT')
-MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL')
-MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET')
-MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY')
-MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET')
-
-
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- client = Minio(
- server,
- secure=False,
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
- try:
- data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- client = Minio(
- server,
- secure=False,
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname
- client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'),
- objPrefix,
- io.BytesIO(data),
- len(data))
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-
-
-def gleanerio(mode, source):
- ## ------------ Create
-
- get_dagster_logger().info(f"Create: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH')
- CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"]
- NAME = "gleaner01_" + source
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "nabu"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
- return 1
-
- data = {}
- data["Image"] = IMAGE
- data["Cmd"] = CMD
-
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
- enva = []
- enva.append(str("MINIO_URL={}".format(MINIO_URL)))
- enva.append(str("MINIO_PORT={}".format(MINIO_PORT)))
- enva.append(str("MINIO_SSL={}".format(MINIO_SSL)))
- enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET)))
- enva.append(str("MINIO_KEY={}".format(MINIO_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET)))
-
- data["Env"] = enva
-
- url = URL + 'containers/create'
- params = {
- "name": NAME
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- get_dagster_logger().info(f"URL: {str(url)}")
-
- req = request.Request(url, str.encode(json.dumps(data)))
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- c = r.read()
- d = json.loads(c)
- cid = d['Id']
-
- print(r.status)
- get_dagster_logger().info(f"Create: {str(r.status)}")
-
- # print(cid)
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
- url = URL + 'containers/' + cid + '/archive'
- params = {
- 'path': ARCHIVE_PATH
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
-
- # DATA = read_file_bytestream(ARCHIVE_FILE)
- DATA = s3reader(ARCHIVE_FILE)
-
- req = request.Request(url, data=DATA, method="PUT")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- print(r.status)
- get_dagster_logger().info(f"Archive: {str(r.status)}")
-
- # c = r.read()
- # print(c)
- # d = json.loads(c)
- # print(d)
-
- ## ------------ Start
-
- url = URL + 'containers/' + cid + '/start'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Start: {str(r.status)}")
-
- ## ------------ Wait expect 200
-
- url = URL + 'containers/' + cid + '/wait'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Wait: {str(r.status)}")
-
- ## ------------ Copy logs expect 200
-
- url = URL + 'containers/' + cid + '/logs'
- params = {
- 'stdout': 'true',
- 'stderr': 'false'
- }
- query_string = urllib.parse.urlencode(params)
-
- url = url + "?" + query_string
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- c = r.read()
-
- # write to file
- # f = open(LOGFILE, 'w')
- # f.write(str(c))
- # f.close()
-
- # write to s3
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
-
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"Logs: {str(r.status)}")
-
- ## ------------ Remove expect 204
-
- url = URL + 'containers/' + cid
- req = request.Request(url, method="DELETE")
- req.add_header('X-API-Key', APIKEY)
- # req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Remove: {str(r.status)}")
-
- return 0
-
-@op
-def benguelacc_gleaner(context):
- returned_value = gleanerio(("gleaner"), "benguelacc")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner notes are {r} ")
- return r
-
-@op
-def benguelacc_nabu(context, msg: str):
- returned_value = gleanerio(("nabu"), "benguelacc")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def benguelacc_nabuprov(context, msg: str):
- returned_value = gleanerio(("prov"), "benguelacc")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def benguelacc_nabuorg(context, msg: str):
- returned_value = gleanerio(("orgs"), "benguelacc")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def benguelacc_naburelease(context, msg: str):
- returned_value = gleanerio(("release"), "benguelacc")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@graph
-def harvest_benguelacc():
- harvest = benguelacc_gleaner()
- load1 = benguelacc_nabu(harvest)
- load2 = benguelacc_nabuprov(load1)
- load3 = benguelacc_nabuorg(load2)
- load4 = benguelacc_naburelease(load3)
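
`s3loader()` in each module writes the captured container stdout to MinIO under a timestamped object name. A stripped-down sketch of that upload; the endpoint, credentials, bucket, and prefix below are placeholders, not the deployed values:

```python
import io
from datetime import datetime
from minio import Minio

def upload_log(client: Minio, bucket: str, prefix: str,
               name: str, data: bytes) -> str:
    # Mirrors s3loader(): object key is <prefix><name>_<timestamp>.log
    stamp = datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
    key = f"{prefix}{name}_{stamp}.log"
    client.put_object(bucket, key, io.BytesIO(data), len(data))
    return key

client = Minio("localhost:9000", access_key="minio",
               secret_key="minio123", secure=False)
# upload_log(client, "gleaner", "logs/", "nabu01_example", b"container output")
```
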
diff --git a/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_caribbeanmarineatlas.py b/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_caribbeanmarineatlas.py
deleted file mode 100644
index 3b8f0c41..00000000
--- a/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_caribbeanmarineatlas.py
+++ /dev/null
@@ -1,293 +0,0 @@
-from dagster import op, graph, get_dagster_logger
-import subprocess
-import os, json, io
-import urllib
-from urllib import request
-from dagster import job, op, get_dagster_logger
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-
-# Vars and Envs
-
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-MINIO_URL = os.environ.get('GLEANER_MINIO_URL')
-MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT')
-MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL')
-MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET')
-MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY')
-MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET')
-
-
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- client = Minio(
- server,
- secure=False,
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
- try:
- data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- client = Minio(
- server,
- secure=False,
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname
- client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'),
- objPrefix,
- io.BytesIO(data),
- len(data))
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-
-
-def gleanerio(mode, source):
- ## ------------ Create
-
- get_dagster_logger().info(f"Create: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH')
- CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"]
- NAME = "gleaner01_" + source
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "nabu"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
- return 1
-
- data = {}
- data["Image"] = IMAGE
- data["Cmd"] = CMD
-
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
- enva = []
- enva.append(str("MINIO_URL={}".format(MINIO_URL)))
- enva.append(str("MINIO_PORT={}".format(MINIO_PORT)))
- enva.append(str("MINIO_SSL={}".format(MINIO_SSL)))
- enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET)))
- enva.append(str("MINIO_KEY={}".format(MINIO_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET)))
-
- data["Env"] = enva
-
- url = URL + 'containers/create'
- params = {
- "name": NAME
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- get_dagster_logger().info(f"URL: {str(url)}")
-
- req = request.Request(url, str.encode(json.dumps(data)))
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- c = r.read()
- d = json.loads(c)
- cid = d['Id']
-
- print(r.status)
- get_dagster_logger().info(f"Create: {str(r.status)}")
-
- # print(cid)
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
- url = URL + 'containers/' + cid + '/archive'
- params = {
- 'path': ARCHIVE_PATH
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
-
- # DATA = read_file_bytestream(ARCHIVE_FILE)
- DATA = s3reader(ARCHIVE_FILE)
-
- req = request.Request(url, data=DATA, method="PUT")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- print(r.status)
- get_dagster_logger().info(f"Archive: {str(r.status)}")
-
- # c = r.read()
- # print(c)
- # d = json.loads(c)
- # print(d)
-
- ## ------------ Start
-
- url = URL + 'containers/' + cid + '/start'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Start: {str(r.status)}")
-
- ## ------------ Wait expect 200
-
- url = URL + 'containers/' + cid + '/wait'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Wait: {str(r.status)}")
-
- ## ------------ Copy logs expect 200
-
- url = URL + 'containers/' + cid + '/logs'
- params = {
- 'stdout': 'true',
- 'stderr': 'false'
- }
- query_string = urllib.parse.urlencode(params)
-
- url = url + "?" + query_string
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- c = r.read()
-
- # write to file
- # f = open(LOGFILE, 'w')
- # f.write(str(c))
- # f.close()
-
- # write to s3
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
-
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"Logs: {str(r.status)}")
-
- ## ------------ Remove expect 204
-
- url = URL + 'containers/' + cid
- req = request.Request(url, method="DELETE")
- req.add_header('X-API-Key', APIKEY)
- # req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Remove: {str(r.status)}")
-
- return 0
-
-@op
-def caribbeanmarineatlas_gleaner(context):
- returned_value = gleanerio(("gleaner"), "caribbeanmarineatlas")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner notes are {r} ")
- return r
-
-@op
-def caribbeanmarineatlas_nabu(context, msg: str):
- returned_value = gleanerio(("nabu"), "caribbeanmarineatlas")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def caribbeanmarineatlas_nabuprov(context, msg: str):
- returned_value = gleanerio(("prov"), "caribbeanmarineatlas")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def caribbeanmarineatlas_nabuorg(context, msg: str):
- returned_value = gleanerio(("orgs"), "caribbeanmarineatlas")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def caribbeanmarineatlas_naburelease(context, msg: str):
- returned_value = gleanerio(("release"), "caribbeanmarineatlas")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@graph
-def harvest_caribbeanmarineatlas():
- harvest = caribbeanmarineatlas_gleaner()
- load1 = caribbeanmarineatlas_nabu(harvest)
- load2 = caribbeanmarineatlas_nabuprov(load1)
- load3 = caribbeanmarineatlas_nabuorg(load2)
- load4 = caribbeanmarineatlas_naburelease(load3)
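
One behavior worth flagging while these modules are removed: `gleanerio()` signals failure by returning 1, and the ops fold that into a log string, so a failed container run still reports the Dagster step as successful. A sketch of surfacing the failure instead; this is a suggested pattern, not what the generated code does:

```python
from dagster import Failure, op

@op
def checked_gleaner(context) -> str:
    rc = 1  # stand-in for gleanerio("gleaner", source)
    if rc != 0:
        raise Failure(description=f"gleanerio exited with code {rc}")
    return "ok"
```
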
diff --git a/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_cioos.py b/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_cioos.py
deleted file mode 100644
index cb156905..00000000
--- a/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_cioos.py
+++ /dev/null
@@ -1,293 +0,0 @@
-from dagster import op, graph, get_dagster_logger
-import subprocess
-import os, json, io
-import urllib
-from urllib import request
-from dagster import job, op, get_dagster_logger
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-
-# Vars and Envs
-
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-MINIO_URL = os.environ.get('GLEANER_MINIO_URL')
-MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT')
-MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL')
-MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET')
-MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY')
-MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET')
-
-
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- client = Minio(
- server,
- secure=False,
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
- try:
- data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- client = Minio(
- server,
- secure=False,
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname
- client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'),
- objPrefix,
- io.BytesIO(data),
- len(data))
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-
-
-def gleanerio(mode, source):
- ## ------------ Create
-
- get_dagster_logger().info(f"Create: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH')
- CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"]
- NAME = "gleaner01_" + source
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "nabu"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
- return 1
-
- data = {}
- data["Image"] = IMAGE
- data["Cmd"] = CMD
-
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
- enva = []
- enva.append(str("MINIO_URL={}".format(MINIO_URL)))
- enva.append(str("MINIO_PORT={}".format(MINIO_PORT)))
- enva.append(str("MINIO_SSL={}".format(MINIO_SSL)))
- enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET)))
- enva.append(str("MINIO_KEY={}".format(MINIO_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET)))
-
- data["Env"] = enva
-
- url = URL + 'containers/create'
- params = {
- "name": NAME
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- get_dagster_logger().info(f"URL: {str(url)}")
-
- req = request.Request(url, str.encode(json.dumps(data)))
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- c = r.read()
- d = json.loads(c)
- cid = d['Id']
-
- print(r.status)
- get_dagster_logger().info(f"Create: {str(r.status)}")
-
- # print(cid)
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
- url = URL + 'containers/' + cid + '/archive'
- params = {
- 'path': ARCHIVE_PATH
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
-
- # DATA = read_file_bytestream(ARCHIVE_FILE)
- DATA = s3reader(ARCHIVE_FILE)
-
- req = request.Request(url, data=DATA, method="PUT")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- print(r.status)
- get_dagster_logger().info(f"Archive: {str(r.status)}")
-
- # c = r.read()
- # print(c)
- # d = json.loads(c)
- # print(d)
-
- ## ------------ Start
-
- url = URL + 'containers/' + cid + '/start'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Start: {str(r.status)}")
-
- ## ------------ Wait expect 200
-
- url = URL + 'containers/' + cid + '/wait'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Wait: {str(r.status)}")
-
- ## ------------ Copy logs expect 200
-
- url = URL + 'containers/' + cid + '/logs'
- params = {
- 'stdout': 'true',
- 'stderr': 'false'
- }
- query_string = urllib.parse.urlencode(params)
-
- url = url + "?" + query_string
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- c = r.read()
-
- # write to file
- # f = open(LOGFILE, 'w')
- # f.write(str(c))
- # f.close()
-
- # write to s3
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
-
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"Logs: {str(r.status)}")
-
- ## ------------ Remove expect 204
-
- url = URL + 'containers/' + cid
- req = request.Request(url, method="DELETE")
- req.add_header('X-API-Key', APIKEY)
- # req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Remove: {str(r.status)}")
-
- return 0
-
-@op
-def cioos_gleaner(context):
- returned_value = gleanerio(("gleaner"), "cioos")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner notes are {r} ")
- return r
-
-@op
-def cioos_nabu(context, msg: str):
- returned_value = gleanerio(("nabu"), "cioos")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def cioos_nabuprov(context, msg: str):
- returned_value = gleanerio(("prov"), "cioos")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def cioos_nabuorg(context, msg: str):
- returned_value = gleanerio(("orgs"), "cioos")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def cioos_naburelease(context, msg: str):
- returned_value = gleanerio(("release"), "cioos")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@graph
-def harvest_cioos():
- harvest = cioos_gleaner()
- load1 = cioos_nabu(harvest)
- load2 = cioos_nabuprov(load1)
- load3 = cioos_nabuorg(load2)
- load4 = cioos_naburelease(load3)
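
Every module also re-reads the same Portainer and MinIO environment variables at import time, so a missing variable only surfaces when a container call fails. A sketch of hoisting that into one shared, validated config; `GleanerEnv` and its field names are illustrative:

```python
import os
from dataclasses import dataclass

@dataclass(frozen=True)
class GleanerEnv:
    portainer_url: str
    portainer_key: str
    minio_server: str
    minio_bucket: str

    @classmethod
    def from_env(cls) -> "GleanerEnv":
        def need(name: str) -> str:
            # fail fast with the variable's name instead of a later KeyError
            value = os.environ.get(name)
            if value is None:
                raise RuntimeError(f"missing required env var {name}")
            return value
        return cls(
            portainer_url=need("PORTAINER_URL"),
            portainer_key=need("PORTAINER_KEY"),
            minio_server=need("GLEANER_MINIO_URL") + ":" + need("GLEANER_MINIO_PORT"),
            minio_bucket=need("GLEANER_MINIO_BUCKET"),
        )
```
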
diff --git a/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_edmerp.py b/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_edmerp.py
deleted file mode 100644
index 31ee259c..00000000
--- a/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_edmerp.py
+++ /dev/null
@@ -1,293 +0,0 @@
-from dagster import op, graph, get_dagster_logger
-import subprocess
-import os, json, io
-import urllib
-from urllib import request
-from dagster import job, op, get_dagster_logger
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-
-# Vars and Envs
-
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-MINIO_URL = os.environ.get('GLEANER_MINIO_URL')
-MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT')
-MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL')
-MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET')
-MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY')
-MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET')
-
-
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- client = Minio(
- server,
- secure=False,
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
- try:
- data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- client = Minio(
- server,
- secure=False,
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname
- client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'),
- objPrefix,
- io.BytesIO(data),
- len(data))
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-
-
-def gleanerio(mode, source):
- ## ------------ Create
-
- get_dagster_logger().info(f"Create: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH')
- CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"]
- NAME = "gleaner01_" + source
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "nabu"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
- return 1
-
- data = {}
- data["Image"] = IMAGE
- data["Cmd"] = CMD
-
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
- enva = []
- enva.append(str("MINIO_URL={}".format(MINIO_URL)))
- enva.append(str("MINIO_PORT={}".format(MINIO_PORT)))
- enva.append(str("MINIO_SSL={}".format(MINIO_SSL)))
- enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET)))
- enva.append(str("MINIO_KEY={}".format(MINIO_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET)))
-
- data["Env"] = enva
-
- url = URL + 'containers/create'
- params = {
- "name": NAME
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- get_dagster_logger().info(f"URL: {str(url)}")
-
- req = request.Request(url, str.encode(json.dumps(data)))
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- c = r.read()
- d = json.loads(c)
- cid = d['Id']
-
- print(r.status)
- get_dagster_logger().info(f"Create: {str(r.status)}")
-
- # print(cid)
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
- url = URL + 'containers/' + cid + '/archive'
- params = {
- 'path': ARCHIVE_PATH
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
-
- # DATA = read_file_bytestream(ARCHIVE_FILE)
- DATA = s3reader(ARCHIVE_FILE)
-
- req = request.Request(url, data=DATA, method="PUT")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- print(r.status)
- get_dagster_logger().info(f"Archive: {str(r.status)}")
-
- # c = r.read()
- # print(c)
- # d = json.loads(c)
- # print(d)
-
- ## ------------ Start
-
- url = URL + 'containers/' + cid + '/start'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Start: {str(r.status)}")
-
- ## ------------ Wait expect 200
-
- url = URL + 'containers/' + cid + '/wait'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Wait: {str(r.status)}")
-
- ## ------------ Copy logs expect 200
-
- url = URL + 'containers/' + cid + '/logs'
- params = {
- 'stdout': 'true',
- 'stderr': 'false'
- }
- query_string = urllib.parse.urlencode(params)
-
- url = url + "?" + query_string
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- c = r.read()
-
- # write to file
- # f = open(LOGFILE, 'w')
- # f.write(str(c))
- # f.close()
-
- # write to s3
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
-
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"Logs: {str(r.status)}")
-
- ## ------------ Remove expect 204
-
- url = URL + 'containers/' + cid
- req = request.Request(url, method="DELETE")
- req.add_header('X-API-Key', APIKEY)
- # req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Remove: {str(r.status)}")
-
- return 0
-
-@op
-def edmerp_gleaner(context):
- returned_value = gleanerio(("gleaner"), "edmerp")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner notes are {r} ")
- return r
-
-@op
-def edmerp_nabu(context, msg: str):
- returned_value = gleanerio(("nabu"), "edmerp")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def edmerp_nabuprov(context, msg: str):
- returned_value = gleanerio(("prov"), "edmerp")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def edmerp_nabuorg(context, msg: str):
- returned_value = gleanerio(("orgs"), "edmerp")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def edmerp_naburelease(context, msg: str):
- returned_value = gleanerio(("release"), "edmerp")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@graph
-def harvest_edmerp():
- harvest = edmerp_gleaner()
- load1 = edmerp_nabu(harvest)
- load2 = edmerp_nabuprov(load1)
- load3 = edmerp_nabuorg(load2)
- load4 = edmerp_naburelease(load3)
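
By this point the shape of the deletion is clear: every file under `generatedCode/` is the same 293-line template with only the source name substituted. The same ops can be produced by a factory instead of code generation; a sketch with a shortened two-op chain, where `gleanerio` is a stand-in for the shared container-runner helper and `"example"` is a hypothetical source:

```python
from dagster import graph, op

def gleanerio(mode: str, source: str) -> int:
    return 0  # stand-in for the shared container-runner helper

def build_harvest_graph(source: str):
    @op(name=f"{source}_gleaner")
    def gleaner_op() -> str:
        return f"returned value:{gleanerio('gleaner', source)}"

    @op(name=f"{source}_nabu")
    def nabu_op(msg: str) -> str:
        return msg + f"returned value:{gleanerio('nabu', source)}"

    @graph(name=f"harvest_{source}")
    def harvest():
        nabu_op(gleaner_op())

    return harvest

harvest_example = build_harvest_graph("example")
job = harvest_example.to_job()
```

One factory call per entry in a source list then replaces one 293-line file per source.
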
diff --git a/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_edmo.py b/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_edmo.py
deleted file mode 100644
index 706e1b60..00000000
--- a/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_edmo.py
+++ /dev/null
@@ -1,293 +0,0 @@
-from dagster import op, graph, get_dagster_logger
-import subprocess
-import os, json, io
-import urllib
-from urllib import request
-from dagster import job, op, get_dagster_logger
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-
-# Vars and Envs
-
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-MINIO_URL = os.environ.get('GLEANER_MINIO_URL')
-MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT')
-MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL')
-MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET')
-MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY')
-MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET')
-
-
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- client = Minio(
- server,
- secure=False,
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
- try:
- data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- client = Minio(
- server,
- secure=False,
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname
- client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'),
- objPrefix,
- io.BytesIO(data),
- len(data))
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-
-
-def gleanerio(mode, source):
- ## ------------ Create
-
- get_dagster_logger().info(f"Create: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH')
- CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"]
- NAME = "gleaner01_" + source
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "nabu"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
- return 1
-
- data = {}
- data["Image"] = IMAGE
- data["Cmd"] = CMD
-
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
- enva = []
- enva.append(str("MINIO_URL={}".format(MINIO_URL)))
- enva.append(str("MINIO_PORT={}".format(MINIO_PORT)))
- enva.append(str("MINIO_SSL={}".format(MINIO_SSL)))
- enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET)))
- enva.append(str("MINIO_KEY={}".format(MINIO_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET)))
-
- data["Env"] = enva
-
- url = URL + 'containers/create'
- params = {
- "name": NAME
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- get_dagster_logger().info(f"URL: {str(url)}")
-
- req = request.Request(url, str.encode(json.dumps(data)))
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- c = r.read()
- d = json.loads(c)
- cid = d['Id']
-
- print(r.status)
- get_dagster_logger().info(f"Create: {str(r.status)}")
-
- # print(cid)
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
- url = URL + 'containers/' + cid + '/archive'
- params = {
- 'path': ARCHIVE_PATH
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
-
- # DATA = read_file_bytestream(ARCHIVE_FILE)
- DATA = s3reader(ARCHIVE_FILE)
-
- req = request.Request(url, data=DATA, method="PUT")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- print(r.status)
- get_dagster_logger().info(f"Archive: {str(r.status)}")
-
- # c = r.read()
- # print(c)
- # d = json.loads(c)
- # print(d)
-
- ## ------------ Start
-
- url = URL + 'containers/' + cid + '/start'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Start: {str(r.status)}")
-
- ## ------------ Wait expect 200
-
- url = URL + 'containers/' + cid + '/wait'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Wait: {str(r.status)}")
-
- ## ------------ Copy logs expect 200
-
- url = URL + 'containers/' + cid + '/logs'
- params = {
- 'stdout': 'true',
- 'stderr': 'false'
- }
- query_string = urllib.parse.urlencode(params)
-
- url = url + "?" + query_string
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- c = r.read()
-
- # write to file
- # f = open(LOGFILE, 'w')
- # f.write(str(c))
- # f.close()
-
- # write to s3
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
-
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"Logs: {str(r.status)}")
-
- ## ------------ Remove expect 204
-
- url = URL + 'containers/' + cid
- req = request.Request(url, method="DELETE")
- req.add_header('X-API-Key', APIKEY)
- # req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Remove: {str(r.status)}")
-
- return 0
-
-@op
-def edmo_gleaner(context):
- returned_value = gleanerio(("gleaner"), "edmo")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner notes are {r} ")
- return r
-
-@op
-def edmo_nabu(context, msg: str):
- returned_value = gleanerio(("nabu"), "edmo")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def edmo_nabuprov(context, msg: str):
- returned_value = gleanerio(("prov"), "edmo")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def edmo_nabuorg(context, msg: str):
- returned_value = gleanerio(("orgs"), "edmo")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def edmo_naburelease(context, msg: str):
- returned_value = gleanerio(("release"), "edmo")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@graph
-def harvest_edmo():
- harvest = edmo_gleaner()
- load1 = edmo_nabu(harvest)
- load2 = edmo_nabuprov(load1)
- load3 = edmo_nabuorg(load2)
- load4 = edmo_naburelease(load3)
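
Finally, the five-way if/elif at the top of `gleanerio()` is a static mapping from mode to container command. The commands below are copied from the code above; only the table form and the example source name are new:

```python
def nabu_cmd(*args: str) -> list:
    return ["--cfg", "/nabu/nabuconfig.yaml", *args]

def command_for(mode: str, source: str):
    table = {
        "gleaner": ["--cfg", "/gleaner/gleanerconfig.yaml",
                    "--source", source, "--rude"],
        "nabu": nabu_cmd("prune", "--prefix", "summoned/" + source),
        "prov": nabu_cmd("prefix", "--prefix", "prov/" + source),
        "orgs": nabu_cmd("prefix", "--prefix", "orgs"),
        "release": nabu_cmd("release", "--prefix", "summoned/" + source),
    }
    return table.get(mode)  # None replaces the bare `return 1` fallthrough

print(command_for("prov", "example"))
```
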
diff --git a/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_emodnet.py b/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_emodnet.py
deleted file mode 100644
index ffe2953a..00000000
--- a/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_emodnet.py
+++ /dev/null
@@ -1,293 +0,0 @@
-from dagster import op, graph, get_dagster_logger
-import subprocess
-import os, json, io
-import urllib
-from urllib import request
-from dagster import job, op, get_dagster_logger
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-
-# Vars and Envs
-
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-MINIO_URL = os.environ.get('GLEANER_MINIO_URL')
-MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT')
-MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL')
-MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET')
-MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY')
-MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET')
-
-
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- client = Minio(
- server,
- secure=False,
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
- try:
- data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- client = Minio(
- server,
- secure=False,
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname
- client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'),
- objPrefix,
- io.BytesIO(data),
- len(data))
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-
-
-def gleanerio(mode, source):
- ## ------------ Create
-
- get_dagster_logger().info(f"Create: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH')
- CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"]
- NAME = "gleaner01_" + source
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "nabu"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
- return 1
-
- data = {}
- data["Image"] = IMAGE
- data["Cmd"] = CMD
-
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
- enva = []
- enva.append(str("MINIO_URL={}".format(MINIO_URL)))
- enva.append(str("MINIO_PORT={}".format(MINIO_PORT)))
- enva.append(str("MINIO_SSL={}".format(MINIO_SSL)))
- enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET)))
- enva.append(str("MINIO_KEY={}".format(MINIO_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET)))
-
- data["Env"] = enva
-
- url = URL + 'containers/create'
- params = {
- "name": NAME
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- get_dagster_logger().info(f"URL: {str(url)}")
-
- req = request.Request(url, str.encode(json.dumps(data)))
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- c = r.read()
- d = json.loads(c)
- cid = d['Id']
-
- print(r.status)
- get_dagster_logger().info(f"Create: {str(r.status)}")
-
- # print(cid)
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
- url = URL + 'containers/' + cid + '/archive'
- params = {
- 'path': ARCHIVE_PATH
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
-
- # DATA = read_file_bytestream(ARCHIVE_FILE)
- DATA = s3reader(ARCHIVE_FILE)
-
- req = request.Request(url, data=DATA, method="PUT")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- print(r.status)
- get_dagster_logger().info(f"Archive: {str(r.status)}")
-
- # c = r.read()
- # print(c)
- # d = json.loads(c)
- # print(d)
-
- ## ------------ Start
-
- url = URL + 'containers/' + cid + '/start'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Start: {str(r.status)}")
-
- ## ------------ Wait expect 200
-
- url = URL + 'containers/' + cid + '/wait'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Wait: {str(r.status)}")
-
- ## ------------ Copy logs expect 200
-
- url = URL + 'containers/' + cid + '/logs'
- params = {
- 'stdout': 'true',
- 'stderr': 'false'
- }
- query_string = urllib.parse.urlencode(params)
-
- url = url + "?" + query_string
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- c = r.read()
-
- # write to file
- # f = open(LOGFILE, 'w')
- # f.write(str(c))
- # f.close()
-
- # write to s3
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
-
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"Logs: {str(r.status)}")
-
- ## ------------ Remove expect 204
-
- url = URL + 'containers/' + cid
- req = request.Request(url, method="DELETE")
- req.add_header('X-API-Key', APIKEY)
- # req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Remove: {str(r.status)}")
-
- return 0
-
-@op
-def emodnet_gleaner(context):
- returned_value = gleanerio(("gleaner"), "emodnet")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner notes are {r} ")
- return r
-
-@op
-def emodnet_nabu(context, msg: str):
- returned_value = gleanerio(("nabu"), "emodnet")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def emodnet_nabuprov(context, msg: str):
- returned_value = gleanerio(("prov"), "emodnet")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def emodnet_nabuorg(context, msg: str):
- returned_value = gleanerio(("orgs"), "emodnet")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def emodnet_naburelease(context, msg: str):
- returned_value = gleanerio(("release"), "emodnet")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@graph
-def harvest_emodnet():
- harvest = emodnet_gleaner()
- load1 = emodnet_nabu(harvest)
- load2 = emodnet_nabuprov(load1)
- load3 = emodnet_nabuorg(load2)
- load4 = emodnet_naburelease(load3)
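
Editorial note: every module deleted in this commit is byte-identical except for the hard-coded source string (`emodnet`, `euroceanevents`, ...), which is why they can be dropped in favor of code built at run time. A minimal sketch of such a factory, assuming the `gleanerio(mode, source)` helper each deleted module defines; all names here are illustrative, not the project's actual generated API:

```python
from dagster import graph, op, get_dagster_logger

def make_harvest_graph(source: str, gleanerio):
    """Build the gleaner -> nabu -> prov -> orgs -> release pipeline
    for one source. Sketch only: `gleanerio` is the helper defined in
    each deleted module, passed in so this stays self-contained."""

    @op(name=f"{source}_gleaner")
    def gleaner_op(context) -> str:
        rv = gleanerio("gleaner", source)
        get_dagster_logger().info(f"Gleaner notes are returned value:{rv}")
        return f"returned value:{rv}"

    def nabu_stage(mode: str):
        # One op per nabu mode, mirroring the *_nabu/_nabuprov/_nabuorg/
        # _naburelease ops in the deleted files.
        @op(name=f"{source}_nabu_{mode}")
        def stage(context, msg: str) -> str:
            return msg + f"returned value:{gleanerio(mode, source)}"
        return stage

    @graph(name=f"harvest_{source}")
    def harvest():
        msg = gleaner_op()
        for mode in ("nabu", "prov", "orgs", "release"):
            msg = nabu_stage(mode)(msg)

    return harvest

# e.g. harvest_emodnet = make_harvest_graph("emodnet", gleanerio)
```
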
diff --git a/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_euroceanevents.py b/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_euroceanevents.py
deleted file mode 100644
index 40b74e75..00000000
--- a/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_euroceanevents.py
+++ /dev/null
@@ -1,293 +0,0 @@
-from dagster import op, graph, get_dagster_logger
-import subprocess
-import os, json, io
-import urllib
-from urllib import request
-from dagster import job, op, get_dagster_logger
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-
-# Vars and Envs
-
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-MINIO_URL = os.environ.get('GLEANER_MINIO_URL')
-MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT')
-MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL')
-MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET')
-MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY')
-MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET')
-
-
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- client = Minio(
- server,
- secure=False,
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
- try:
- data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- client = Minio(
- server,
- secure=False,
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname
- client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'),
- objPrefix,
- io.BytesIO(data),
- len(data))
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-
-
-def gleanerio(mode, source):
- ## ------------ Create
-
- get_dagster_logger().info(f"Create: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH')
- CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"]
- NAME = "gleaner01_" + source
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "nabu"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
- return 1
-
- data = {}
- data["Image"] = IMAGE
- data["Cmd"] = CMD
-
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
- enva = []
- enva.append(str("MINIO_URL={}".format(MINIO_URL)))
- enva.append(str("MINIO_PORT={}".format(MINIO_PORT)))
- enva.append(str("MINIO_SSL={}".format(MINIO_SSL)))
- enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET)))
- enva.append(str("MINIO_KEY={}".format(MINIO_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET)))
-
- data["Env"] = enva
-
- url = URL + 'containers/create'
- params = {
- "name": NAME
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- get_dagster_logger().info(f"URL: {str(url)}")
-
- req = request.Request(url, str.encode(json.dumps(data)))
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- c = r.read()
- d = json.loads(c)
- cid = d['Id']
-
- print(r.status)
- get_dagster_logger().info(f"Create: {str(r.status)}")
-
- # print(cid)
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
- url = URL + 'containers/' + cid + '/archive'
- params = {
- 'path': ARCHIVE_PATH
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
-
- # DATA = read_file_bytestream(ARCHIVE_FILE)
- DATA = s3reader(ARCHIVE_FILE)
-
- req = request.Request(url, data=DATA, method="PUT")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- print(r.status)
- get_dagster_logger().info(f"Archive: {str(r.status)}")
-
- # c = r.read()
- # print(c)
- # d = json.loads(c)
- # print(d)
-
- ## ------------ Start
-
- url = URL + 'containers/' + cid + '/start'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Start: {str(r.status)}")
-
- ## ------------ Wait expect 200
-
- url = URL + 'containers/' + cid + '/wait'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Wait: {str(r.status)}")
-
- ## ------------ Copy logs expect 200
-
- url = URL + 'containers/' + cid + '/logs'
- params = {
- 'stdout': 'true',
- 'stderr': 'false'
- }
- query_string = urllib.parse.urlencode(params)
-
- url = url + "?" + query_string
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- c = r.read()
-
- # write to file
- # f = open(LOGFILE, 'w')
- # f.write(str(c))
- # f.close()
-
- # write to s3
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
-
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"Logs: {str(r.status)}")
-
- ## ------------ Remove expect 204
-
- url = URL + 'containers/' + cid
- req = request.Request(url, method="DELETE")
- req.add_header('X-API-Key', APIKEY)
- # req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Remove: {str(r.status)}")
-
- return 0
-
-@op
-def euroceanevents_gleaner(context):
- returned_value = gleanerio(("gleaner"), "euroceanevents")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner notes are {r} ")
- return r
-
-@op
-def euroceanevents_nabu(context, msg: str):
- returned_value = gleanerio(("nabu"), "euroceanevents")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def euroceanevents_nabuprov(context, msg: str):
- returned_value = gleanerio(("prov"), "euroceanevents")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def euroceanevents_nabuorg(context, msg: str):
- returned_value = gleanerio(("orgs"), "euroceanevents")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def euroceanevents_naburelease(context, msg: str):
- returned_value = gleanerio(("release"), "euroceanevents")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@graph
-def harvest_euroceanevents():
- harvest = euroceanevents_gleaner()
- load1 = euroceanevents_nabu(harvest)
- load2 = euroceanevents_nabuprov(load1)
- load3 = euroceanevents_nabuorg(load2)
- load4 = euroceanevents_naburelease(load3)
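
Each deleted module also repeats the same six-step Portainer exchange (create, archive upload, start, wait, logs, remove), rebuilding a `urllib` request by hand at every step. A condensed request helper, sketched under the assumption that `PORTAINER_URL` already carries the endpoint prefix, exactly as the string concatenation above implies:

```python
import urllib.parse
from urllib import request

def portainer_call(base, apikey, path, *, method="GET", params=None,
                   body=None, content_type="application/json"):
    """One authenticated request against the Portainer endpoint.
    Sketch of the pattern repeated inside gleanerio(); returns the
    HTTP status and the raw response bytes."""
    url = base + path
    if params:
        url = url + "?" + urllib.parse.urlencode(params)
    req = request.Request(url, data=body, method=method)
    req.add_header("X-API-Key", apikey)
    req.add_header("content-type", content_type)
    req.add_header("accept", "application/json")
    with request.urlopen(req) as r:
        return r.status, r.read()
```

With that in place, the create step collapses to a single call: `portainer_call(URL, APIKEY, "containers/create", method="POST", params={"name": NAME}, body=json.dumps(data).encode())`, with the container id read from the returned JSON as before.
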
diff --git a/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_euroceanexperts.py b/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_euroceanexperts.py
deleted file mode 100644
index 0f30c008..00000000
--- a/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_euroceanexperts.py
+++ /dev/null
@@ -1,293 +0,0 @@
-from dagster import op, graph, get_dagster_logger
-import subprocess
-import os, json, io
-import urllib
-from urllib import request
-from dagster import job, op, get_dagster_logger
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-
-# Vars and Envs
-
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-MINIO_URL = os.environ.get('GLEANER_MINIO_URL')
-MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT')
-MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL')
-MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET')
-MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY')
-MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET')
-
-
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- client = Minio(
- server,
- secure=False,
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
- try:
- data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- client = Minio(
- server,
- secure=False,
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname
- client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'),
- objPrefix,
- io.BytesIO(data),
- len(data))
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-
-
-def gleanerio(mode, source):
- ## ------------ Create
-
- get_dagster_logger().info(f"Create: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH')
- CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"]
- NAME = "gleaner01_" + source
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "nabu"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
- return 1
-
- data = {}
- data["Image"] = IMAGE
- data["Cmd"] = CMD
-
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
- enva = []
- enva.append(str("MINIO_URL={}".format(MINIO_URL)))
- enva.append(str("MINIO_PORT={}".format(MINIO_PORT)))
- enva.append(str("MINIO_SSL={}".format(MINIO_SSL)))
- enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET)))
- enva.append(str("MINIO_KEY={}".format(MINIO_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET)))
-
- data["Env"] = enva
-
- url = URL + 'containers/create'
- params = {
- "name": NAME
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- get_dagster_logger().info(f"URL: {str(url)}")
-
- req = request.Request(url, str.encode(json.dumps(data)))
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- c = r.read()
- d = json.loads(c)
- cid = d['Id']
-
- print(r.status)
- get_dagster_logger().info(f"Create: {str(r.status)}")
-
- # print(cid)
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
- url = URL + 'containers/' + cid + '/archive'
- params = {
- 'path': ARCHIVE_PATH
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
-
- # DATA = read_file_bytestream(ARCHIVE_FILE)
- DATA = s3reader(ARCHIVE_FILE)
-
- req = request.Request(url, data=DATA, method="PUT")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- print(r.status)
- get_dagster_logger().info(f"Archive: {str(r.status)}")
-
- # c = r.read()
- # print(c)
- # d = json.loads(c)
- # print(d)
-
- ## ------------ Start
-
- url = URL + 'containers/' + cid + '/start'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Start: {str(r.status)}")
-
- ## ------------ Wait expect 200
-
- url = URL + 'containers/' + cid + '/wait'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Wait: {str(r.status)}")
-
- ## ------------ Copy logs expect 200
-
- url = URL + 'containers/' + cid + '/logs'
- params = {
- 'stdout': 'true',
- 'stderr': 'false'
- }
- query_string = urllib.parse.urlencode(params)
-
- url = url + "?" + query_string
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- c = r.read()
-
- # write to file
- # f = open(LOGFILE, 'w')
- # f.write(str(c))
- # f.close()
-
- # write to s3
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
-
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"Logs: {str(r.status)}")
-
- ## ------------ Remove expect 204
-
- url = URL + 'containers/' + cid
- req = request.Request(url, method="DELETE")
- req.add_header('X-API-Key', APIKEY)
- # req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Remove: {str(r.status)}")
-
- return 0
-
-@op
-def euroceanexperts_gleaner(context):
- returned_value = gleanerio(("gleaner"), "euroceanexperts")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner notes are {r} ")
- return r
-
-@op
-def euroceanexperts_nabu(context, msg: str):
- returned_value = gleanerio(("nabu"), "euroceanexperts")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def euroceanexperts_nabuprov(context, msg: str):
- returned_value = gleanerio(("prov"), "euroceanexperts")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def euroceanexperts_nabuorg(context, msg: str):
- returned_value = gleanerio(("orgs"), "euroceanexperts")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def euroceanexperts_naburelease(context, msg: str):
- returned_value = gleanerio(("release"), "euroceanexperts")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@graph
-def harvest_euroceanexperts():
- harvest = euroceanexperts_gleaner()
- load1 = euroceanexperts_nabu(harvest)
- load2 = euroceanexperts_nabuprov(load1)
- load3 = euroceanexperts_nabuorg(load2)
- load4 = euroceanexperts_naburelease(load3)
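
One detail worth flagging in the deleted helpers: `s3reader` and `s3loader` both read `GLEANER_MINIO_SSL` into `MINIO_SSL`, yet every client is constructed with a hard-coded `secure=False`. A small client factory that actually honors the flag — a sketch of a fix, not the current behavior:

```python
import os
from minio import Minio

def minio_client() -> Minio:
    """Build one MinIO client from the GLEANER_MINIO_* variables.
    Unlike the inlined clients above, this honors GLEANER_MINIO_SSL
    instead of pinning secure=False."""
    secure = os.environ.get("GLEANER_MINIO_SSL", "false").lower() == "true"
    server = "{}:{}".format(os.environ.get("GLEANER_MINIO_URL"),
                            os.environ.get("GLEANER_MINIO_PORT"))
    return Minio(
        server,
        secure=secure,
        access_key=os.environ.get("GLEANER_MINIO_KEY"),
        secret_key=os.environ.get("GLEANER_MINIO_SECRET"),
    )
```
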
diff --git a/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_euroceaninstitutions.py b/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_euroceaninstitutions.py
deleted file mode 100644
index abb012d7..00000000
--- a/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_euroceaninstitutions.py
+++ /dev/null
@@ -1,293 +0,0 @@
-from dagster import op, graph, get_dagster_logger
-import subprocess
-import os, json, io
-import urllib
-from urllib import request
-from dagster import job, op, get_dagster_logger
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-
-# Vars and Envs
-
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-MINIO_URL = os.environ.get('GLEANER_MINIO_URL')
-MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT')
-MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL')
-MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET')
-MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY')
-MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET')
-
-
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- client = Minio(
- server,
- secure=False,
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
- try:
- data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- client = Minio(
- server,
- secure=False,
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname
- client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'),
- objPrefix,
- io.BytesIO(data),
- len(data))
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-
-
-def gleanerio(mode, source):
- ## ------------ Create
-
- get_dagster_logger().info(f"Create: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH')
- CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"]
- NAME = "gleaner01_" + source
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "nabu"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
- return 1
-
- data = {}
- data["Image"] = IMAGE
- data["Cmd"] = CMD
-
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
- enva = []
- enva.append(str("MINIO_URL={}".format(MINIO_URL)))
- enva.append(str("MINIO_PORT={}".format(MINIO_PORT)))
- enva.append(str("MINIO_SSL={}".format(MINIO_SSL)))
- enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET)))
- enva.append(str("MINIO_KEY={}".format(MINIO_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET)))
-
- data["Env"] = enva
-
- url = URL + 'containers/create'
- params = {
- "name": NAME
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- get_dagster_logger().info(f"URL: {str(url)}")
-
- req = request.Request(url, str.encode(json.dumps(data)))
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- c = r.read()
- d = json.loads(c)
- cid = d['Id']
-
- print(r.status)
- get_dagster_logger().info(f"Create: {str(r.status)}")
-
- # print(cid)
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
- url = URL + 'containers/' + cid + '/archive'
- params = {
- 'path': ARCHIVE_PATH
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
-
- # DATA = read_file_bytestream(ARCHIVE_FILE)
- DATA = s3reader(ARCHIVE_FILE)
-
- req = request.Request(url, data=DATA, method="PUT")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- print(r.status)
- get_dagster_logger().info(f"Archive: {str(r.status)}")
-
- # c = r.read()
- # print(c)
- # d = json.loads(c)
- # print(d)
-
- ## ------------ Start
-
- url = URL + 'containers/' + cid + '/start'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Start: {str(r.status)}")
-
- ## ------------ Wait expect 200
-
- url = URL + 'containers/' + cid + '/wait'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Wait: {str(r.status)}")
-
- ## ------------ Copy logs expect 200
-
- url = URL + 'containers/' + cid + '/logs'
- params = {
- 'stdout': 'true',
- 'stderr': 'false'
- }
- query_string = urllib.parse.urlencode(params)
-
- url = url + "?" + query_string
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- c = r.read()
-
- # write to file
- # f = open(LOGFILE, 'w')
- # f.write(str(c))
- # f.close()
-
- # write to s3
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
-
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"Logs: {str(r.status)}")
-
- ## ------------ Remove expect 204
-
- url = URL + 'containers/' + cid
- req = request.Request(url, method="DELETE")
- req.add_header('X-API-Key', APIKEY)
- # req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Remove: {str(r.status)}")
-
- return 0
-
-@op
-def euroceaninstitutions_gleaner(context):
- returned_value = gleanerio(("gleaner"), "euroceaninstitutions")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner notes are {r} ")
- return r
-
-@op
-def euroceaninstitutions_nabu(context, msg: str):
- returned_value = gleanerio(("nabu"), "euroceaninstitutions")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def euroceaninstitutions_nabuprov(context, msg: str):
- returned_value = gleanerio(("prov"), "euroceaninstitutions")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def euroceaninstitutions_nabuorg(context, msg: str):
- returned_value = gleanerio(("orgs"), "euroceaninstitutions")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def euroceaninstitutions_naburelease(context, msg: str):
- returned_value = gleanerio(("release"), "euroceaninstitutions")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@graph
-def harvest_euroceaninstitutions():
- harvest = euroceaninstitutions_gleaner()
- load1 = euroceaninstitutions_nabu(harvest)
- load2 = euroceaninstitutions_nabuprov(load1)
- load3 = euroceaninstitutions_nabuorg(load2)
- load4 = euroceaninstitutions_naburelease(load3)
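
The mode dispatch in `gleanerio()` is a five-branch `if`/`elif` chain in which the four nabu branches differ only by subcommand and prefix. The same mapping expressed as data, using exactly the commands hard-coded in the deleted files (a sketch):

```python
import os

def mode_config(mode: str, source: str) -> dict:
    """Image, command, and container name for one gleanerio mode."""
    nabu = {
        "nabu":    ["prune",   "--prefix", "summoned/" + source],
        "prov":    ["prefix",  "--prefix", "prov/" + source],
        "orgs":    ["prefix",  "--prefix", "orgs"],
        "release": ["release", "--prefix", "summoned/" + source],
    }
    if mode == "gleaner":
        return {
            "image": os.environ.get("GLEANERIO_GLEANER_IMAGE"),
            "cmd": ["--cfg", "/gleaner/gleanerconfig.yaml",
                    "--source", source, "--rude"],
            "name": "gleaner01_" + source,
        }
    if mode in nabu:
        return {
            "image": os.environ.get("GLEANERIO_NABU_IMAGE"),
            "cmd": ["--cfg", "/nabu/nabuconfig.yaml"] + nabu[mode],
            "name": "nabu01_" + source,
        }
    # the deleted modules return 1 here instead of raising
    raise ValueError(f"unknown gleanerio mode: {mode}")
```
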
diff --git a/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_euroceanorgs.py b/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_euroceanorgs.py
deleted file mode 100644
index 1e221581..00000000
--- a/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_euroceanorgs.py
+++ /dev/null
@@ -1,293 +0,0 @@
-from dagster import op, graph, get_dagster_logger
-import subprocess
-import os, json, io
-import urllib
-from urllib import request
-from dagster import job, op, get_dagster_logger
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-
-# Vars and Envs
-
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-MINIO_URL = os.environ.get('GLEANER_MINIO_URL')
-MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT')
-MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL')
-MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET')
-MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY')
-MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET')
-
-
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- client = Minio(
- server,
- secure=False,
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
- try:
- data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- client = Minio(
- server,
- secure=False,
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname
- client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'),
- objPrefix,
- io.BytesIO(data),
- len(data))
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-
-
-def gleanerio(mode, source):
- ## ------------ Create
-
- get_dagster_logger().info(f"Create: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH')
- CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"]
- NAME = "gleaner01_" + source
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "nabu"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
- return 1
-
- data = {}
- data["Image"] = IMAGE
- data["Cmd"] = CMD
-
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
- enva = []
- enva.append(str("MINIO_URL={}".format(MINIO_URL)))
- enva.append(str("MINIO_PORT={}".format(MINIO_PORT)))
- enva.append(str("MINIO_SSL={}".format(MINIO_SSL)))
- enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET)))
- enva.append(str("MINIO_KEY={}".format(MINIO_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET)))
-
- data["Env"] = enva
-
- url = URL + 'containers/create'
- params = {
- "name": NAME
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- get_dagster_logger().info(f"URL: {str(url)}")
-
- req = request.Request(url, str.encode(json.dumps(data)))
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- c = r.read()
- d = json.loads(c)
- cid = d['Id']
-
- print(r.status)
- get_dagster_logger().info(f"Create: {str(r.status)}")
-
- # print(cid)
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
- url = URL + 'containers/' + cid + '/archive'
- params = {
- 'path': ARCHIVE_PATH
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
-
- # DATA = read_file_bytestream(ARCHIVE_FILE)
- DATA = s3reader(ARCHIVE_FILE)
-
- req = request.Request(url, data=DATA, method="PUT")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- print(r.status)
- get_dagster_logger().info(f"Archive: {str(r.status)}")
-
- # c = r.read()
- # print(c)
- # d = json.loads(c)
- # print(d)
-
- ## ------------ Start
-
- url = URL + 'containers/' + cid + '/start'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Start: {str(r.status)}")
-
- ## ------------ Wait expect 200
-
- url = URL + 'containers/' + cid + '/wait'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Wait: {str(r.status)}")
-
- ## ------------ Copy logs expect 200
-
- url = URL + 'containers/' + cid + '/logs'
- params = {
- 'stdout': 'true',
- 'stderr': 'false'
- }
- query_string = urllib.parse.urlencode(params)
-
- url = url + "?" + query_string
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- c = r.read()
-
- # write to file
- # f = open(LOGFILE, 'w')
- # f.write(str(c))
- # f.close()
-
- # write to s3
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
-
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"Logs: {str(r.status)}")
-
- ## ------------ Remove expect 204
-
- url = URL + 'containers/' + cid
- req = request.Request(url, method="DELETE")
- req.add_header('X-API-Key', APIKEY)
- # req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Remove: {str(r.status)}")
-
- return 0
-
-@op
-def euroceanorgs_gleaner(context):
- returned_value = gleanerio(("gleaner"), "euroceanorgs")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner notes are {r} ")
- return r
-
-@op
-def euroceanorgs_nabu(context, msg: str):
- returned_value = gleanerio(("nabu"), "euroceanorgs")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def euroceanorgs_nabuprov(context, msg: str):
- returned_value = gleanerio(("prov"), "euroceanorgs")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def euroceanorgs_nabuorg(context, msg: str):
- returned_value = gleanerio(("orgs"), "euroceanorgs")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def euroceanorgs_naburelease(context, msg: str):
- returned_value = gleanerio(("release"), "euroceanorgs")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@graph
-def harvest_euroceanorgs():
- harvest = euroceanorgs_gleaner()
- load1 = euroceanorgs_nabu(harvest)
- load2 = euroceanorgs_nabuprov(load1)
- load3 = euroceanorgs_nabuorg(load2)
- load4 = euroceanorgs_naburelease(load3)
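
The deleted modules stop at the `@graph`; the jobs and schedules that execute these graphs live elsewhere in the generated tree. For orientation, one way a graph such as `harvest_euroceanorgs` becomes runnable and scheduled in Dagster — a sketch with an illustrative cron string, not the project's actual wiring:

```python
from dagster import ScheduleDefinition

# Turn the composed graph into an executable job, then attach a schedule.
harvest_euroceanorgs_job = harvest_euroceanorgs.to_job(
    name="harvest_euroceanorgs_job",
)
harvest_euroceanorgs_schedule = ScheduleDefinition(
    job=harvest_euroceanorgs_job,
    cron_schedule="0 4 * * 0",  # weekly; placeholder value only
)
```
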
diff --git a/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_euroceanprojects.py b/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_euroceanprojects.py
deleted file mode 100644
index d76a388e..00000000
--- a/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_euroceanprojects.py
+++ /dev/null
@@ -1,293 +0,0 @@
-from dagster import op, graph, get_dagster_logger
-import subprocess
-import os, json, io
-import urllib
-from urllib import request
-from dagster import job, op, get_dagster_logger
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-
-# Vars and Envs
-
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-MINIO_URL = os.environ.get('GLEANER_MINIO_URL')
-MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT')
-MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL')
-MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET')
-MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY')
-MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET')
-
-
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- client = Minio(
- server,
- secure=False,
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
- try:
- data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- client = Minio(
- server,
- secure=False,
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname
- client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'),
- objPrefix,
- io.BytesIO(data),
- len(data))
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-
-
-def gleanerio(mode, source):
- ## ------------ Create
-
- get_dagster_logger().info(f"Create: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH')
- CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"]
- NAME = "gleaner01_" + source
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "nabu"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
- return 1
-
- data = {}
- data["Image"] = IMAGE
- data["Cmd"] = CMD
-
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
- enva = []
- enva.append(str("MINIO_URL={}".format(MINIO_URL)))
- enva.append(str("MINIO_PORT={}".format(MINIO_PORT)))
- enva.append(str("MINIO_SSL={}".format(MINIO_SSL)))
- enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET)))
- enva.append(str("MINIO_KEY={}".format(MINIO_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET)))
-
- data["Env"] = enva
-
- url = URL + 'containers/create'
- params = {
- "name": NAME
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- get_dagster_logger().info(f"URL: {str(url)}")
-
- req = request.Request(url, str.encode(json.dumps(data)))
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- c = r.read()
- d = json.loads(c)
- cid = d['Id']
-
- print(r.status)
- get_dagster_logger().info(f"Create: {str(r.status)}")
-
- # print(cid)
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
- url = URL + 'containers/' + cid + '/archive'
- params = {
- 'path': ARCHIVE_PATH
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
-
- # DATA = read_file_bytestream(ARCHIVE_FILE)
- DATA = s3reader(ARCHIVE_FILE)
-
- req = request.Request(url, data=DATA, method="PUT")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- print(r.status)
- get_dagster_logger().info(f"Archive: {str(r.status)}")
-
- # c = r.read()
- # print(c)
- # d = json.loads(c)
- # print(d)
-
- ## ------------ Start
-
- url = URL + 'containers/' + cid + '/start'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Start: {str(r.status)}")
-
- ## ------------ Wait expect 200
-
- url = URL + 'containers/' + cid + '/wait'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Wait: {str(r.status)}")
-
- ## ------------ Copy logs expect 200
-
- url = URL + 'containers/' + cid + '/logs'
- params = {
- 'stdout': 'true',
- 'stderr': 'false'
- }
- query_string = urllib.parse.urlencode(params)
-
- url = url + "?" + query_string
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- c = r.read()
-
- # write to file
- # f = open(LOGFILE, 'w')
- # f.write(str(c))
- # f.close()
-
- # write to s3
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
-
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"Logs: {str(r.status)}")
-
- ## ------------ Remove expect 204
-
- url = URL + 'containers/' + cid
- req = request.Request(url, method="DELETE")
- req.add_header('X-API-Key', APIKEY)
- # req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Remove: {str(r.status)}")
-
- return 0
-
-@op
-def euroceanprojects_gleaner(context):
- returned_value = gleanerio(("gleaner"), "euroceanprojects")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner notes are {r} ")
- return r
-
-@op
-def euroceanprojects_nabu(context, msg: str):
- returned_value = gleanerio(("nabu"), "euroceanprojects")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def euroceanprojects_nabuprov(context, msg: str):
- returned_value = gleanerio(("prov"), "euroceanprojects")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def euroceanprojects_nabuorg(context, msg: str):
- returned_value = gleanerio(("orgs"), "euroceanprojects")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def euroceanprojects_naburelease(context, msg: str):
- returned_value = gleanerio(("release"), "euroceanprojects")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@graph
-def harvest_euroceanprojects():
- harvest = euroceanprojects_gleaner()
- load1 = euroceanprojects_nabu(harvest)
- load2 = euroceanprojects_nabuprov(load1)
- load3 = euroceanprojects_nabuorg(load2)
- load4 = euroceanprojects_naburelease(load3)
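
The tail of the container lifecycle (wait for exit, fetch stdout, upload the log, remove the container) likewise repeats verbatim in every file. Condensed on top of the `portainer_call` sketch above, with the 200/204 statuses the deleted code's comments expect; names are illustrative:

```python
def finish_container(base, apikey, cid, name, s3loader):
    """Wait for exit, ship stdout to MinIO, remove the container.
    Sketch reusing portainer_call() from the earlier sketch and the
    s3loader() each deleted module defines."""
    status, _ = portainer_call(base, apikey, f"containers/{cid}/wait",
                               method="POST")
    assert status == 200, f"wait returned {status}"
    status, logs = portainer_call(base, apikey, f"containers/{cid}/logs",
                                  params={"stdout": "true",
                                          "stderr": "false"})
    s3loader(logs, name)  # s3loader needs a bytes-like object
    status, _ = portainer_call(base, apikey, f"containers/{cid}",
                               method="DELETE")
    assert status == 204, f"remove returned {status}"
```
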
diff --git a/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_euroceantraining.py b/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_euroceantraining.py
deleted file mode 100644
index 89e918ec..00000000
--- a/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_euroceantraining.py
+++ /dev/null
@@ -1,293 +0,0 @@
-from dagster import op, graph, get_dagster_logger
-import subprocess
-import os, json, io
-import urllib
-from urllib import request
-from dagster import job, op, get_dagster_logger
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-
-# Vars and Envs
-
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-MINIO_URL = os.environ.get('GLEANER_MINIO_URL')
-MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT')
-MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL')
-MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET')
-MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY')
-MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET')
-
-
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- client = Minio(
- server,
- secure=False,
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
- try:
- data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- client = Minio(
- server,
- secure=False,
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname
- client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'),
- objPrefix,
- io.BytesIO(data),
- len(data))
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-
-
-def gleanerio(mode, source):
- ## ------------ Create
-
- get_dagster_logger().info(f"Create: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH')
- CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"]
- NAME = "gleaner01_" + source
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "nabu"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
- return 1
-
- data = {}
- data["Image"] = IMAGE
- data["Cmd"] = CMD
-
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
- enva = []
- enva.append(str("MINIO_URL={}".format(MINIO_URL)))
- enva.append(str("MINIO_PORT={}".format(MINIO_PORT)))
- enva.append(str("MINIO_SSL={}".format(MINIO_SSL)))
- enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET)))
- enva.append(str("MINIO_KEY={}".format(MINIO_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET)))
-
- data["Env"] = enva
-
- url = URL + 'containers/create'
- params = {
- "name": NAME
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- get_dagster_logger().info(f"URL: {str(url)}")
-
- req = request.Request(url, str.encode(json.dumps(data)))
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- c = r.read()
- d = json.loads(c)
- cid = d['Id']
-
- print(r.status)
- get_dagster_logger().info(f"Create: {str(r.status)}")
-
- # print(cid)
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
- url = URL + 'containers/' + cid + '/archive'
- params = {
- 'path': ARCHIVE_PATH
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
-
- # DATA = read_file_bytestream(ARCHIVE_FILE)
- DATA = s3reader(ARCHIVE_FILE)
-
- req = request.Request(url, data=DATA, method="PUT")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- print(r.status)
- get_dagster_logger().info(f"Archive: {str(r.status)}")
-
- # c = r.read()
- # print(c)
- # d = json.loads(c)
- # print(d)
-
- ## ------------ Start
-
- url = URL + 'containers/' + cid + '/start'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Start: {str(r.status)}")
-
- ## ------------ Wait expect 200
-
- url = URL + 'containers/' + cid + '/wait'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Wait: {str(r.status)}")
-
- ## ------------ Copy logs expect 200
-
- url = URL + 'containers/' + cid + '/logs'
- params = {
- 'stdout': 'true',
- 'stderr': 'false'
- }
- query_string = urllib.parse.urlencode(params)
-
- url = url + "?" + query_string
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- c = r.read()
-
- # write to file
- # f = open(LOGFILE, 'w')
- # f.write(str(c))
- # f.close()
-
- # write to s3
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
-
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"Logs: {str(r.status)}")
-
- ## ------------ Remove expect 204
-
- url = URL + 'containers/' + cid
- req = request.Request(url, method="DELETE")
- req.add_header('X-API-Key', APIKEY)
- # req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Remove: {str(r.status)}")
-
- return 0
-
-@op
-def euroceantraining_gleaner(context):
- returned_value = gleanerio(("gleaner"), "euroceantraining")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner notes are {r} ")
- return r
-
-@op
-def euroceantraining_nabu(context, msg: str):
- returned_value = gleanerio(("nabu"), "euroceantraining")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def euroceantraining_nabuprov(context, msg: str):
- returned_value = gleanerio(("prov"), "euroceantraining")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def euroceantraining_nabuorg(context, msg: str):
- returned_value = gleanerio(("orgs"), "euroceantraining")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def euroceantraining_naburelease(context, msg: str):
- returned_value = gleanerio(("release"), "euroceantraining")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@graph
-def harvest_euroceantraining():
- harvest = euroceantraining_gleaner()
- load1 = euroceantraining_nabu(harvest)
- load2 = euroceantraining_nabuprov(load1)
- load3 = euroceantraining_nabuorg(load2)
- load4 = euroceantraining_naburelease(load3)
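
Every generated module removed in this commit is the same ~293-line template with only the source name swapped in (`euroceantraining` above; `euroceanvessels`, `inanodc`, and the `invemar*` sources below). A minimal sketch of how a single parameterized factory could stand in for all of them — the name `build_harvest_graph` is hypothetical, and the shared `gleanerio(mode, source)` helper that each deleted file re-declares is passed in rather than copied:

```python
from dagster import graph, op, get_dagster_logger

def build_harvest_graph(source: str, gleanerio):
    """Hypothetical factory: one harvest graph per source, replacing a
    generated per-source module. `gleanerio(mode, source)` is the helper
    every deleted file duplicates."""

    @op(name=f"{source}_gleaner")
    def gleaner_op(context):
        rv = gleanerio("gleaner", source)
        get_dagster_logger().info(f"Gleaner notes are returned value:{rv}")
        return f"returned value:{rv}"

    def make_nabu_op(mode: str):
        # One op per nabu mode, chained on the message string like the
        # generated *_nabu / *_nabuprov / *_nabuorg / *_naburelease ops.
        @op(name=f"{source}_{mode}")
        def nabu_op(context, msg: str):
            return msg + f"returned value:{gleanerio(mode, source)}"
        return nabu_op

    @graph(name=f"harvest_{source}")
    def harvest():
        msg = gleaner_op()
        # Same chain the deleted files hard-code: nabu -> prov -> orgs -> release
        for mode in ("nabu", "prov", "orgs", "release"):
            msg = make_nabu_op(mode)(msg)

    return harvest
```

Called as `build_harvest_graph("euroceantraining", gleanerio)`, this yields the same op chain each deleted file spells out by hand; it is a sketch of the parameterization these deletions imply, not the replacement code itself.
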
diff --git a/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_euroceanvessels.py b/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_euroceanvessels.py
deleted file mode 100644
index 6691f026..00000000
--- a/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_euroceanvessels.py
+++ /dev/null
@@ -1,293 +0,0 @@
-from dagster import op, graph, get_dagster_logger
-import subprocess
-import os, json, io
-import urllib
-from urllib import request
-from dagster import job, op, get_dagster_logger
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-
-# Vars and Envs
-
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-MINIO_URL = os.environ.get('GLEANER_MINIO_URL')
-MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT')
-MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL')
-MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET')
-MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY')
-MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET')
-
-
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- client = Minio(
- server,
- secure=False,
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
- try:
- data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- client = Minio(
- server,
- secure=False,
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname
- client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'),
- objPrefix,
- io.BytesIO(data),
- len(data))
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-
-
-def gleanerio(mode, source):
- ## ------------ Create
-
- get_dagster_logger().info(f"Create: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH')
- CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"]
- NAME = "gleaner01_" + source
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "nabu"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
- return 1
-
- data = {}
- data["Image"] = IMAGE
- data["Cmd"] = CMD
-
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
- enva = []
- enva.append(str("MINIO_URL={}".format(MINIO_URL)))
- enva.append(str("MINIO_PORT={}".format(MINIO_PORT)))
- enva.append(str("MINIO_SSL={}".format(MINIO_SSL)))
- enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET)))
- enva.append(str("MINIO_KEY={}".format(MINIO_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET)))
-
- data["Env"] = enva
-
- url = URL + 'containers/create'
- params = {
- "name": NAME
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- get_dagster_logger().info(f"URL: {str(url)}")
-
- req = request.Request(url, str.encode(json.dumps(data)))
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- c = r.read()
- d = json.loads(c)
- cid = d['Id']
-
- print(r.status)
- get_dagster_logger().info(f"Create: {str(r.status)}")
-
- # print(cid)
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
- url = URL + 'containers/' + cid + '/archive'
- params = {
- 'path': ARCHIVE_PATH
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
-
- # DATA = read_file_bytestream(ARCHIVE_FILE)
- DATA = s3reader(ARCHIVE_FILE)
-
- req = request.Request(url, data=DATA, method="PUT")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- print(r.status)
- get_dagster_logger().info(f"Archive: {str(r.status)}")
-
- # c = r.read()
- # print(c)
- # d = json.loads(c)
- # print(d)
-
- ## ------------ Start
-
- url = URL + 'containers/' + cid + '/start'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Start: {str(r.status)}")
-
- ## ------------ Wait expect 200
-
- url = URL + 'containers/' + cid + '/wait'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Wait: {str(r.status)}")
-
- ## ------------ Copy logs expect 200
-
- url = URL + 'containers/' + cid + '/logs'
- params = {
- 'stdout': 'true',
- 'stderr': 'false'
- }
- query_string = urllib.parse.urlencode(params)
-
- url = url + "?" + query_string
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- c = r.read()
-
- # write to file
- # f = open(LOGFILE, 'w')
- # f.write(str(c))
- # f.close()
-
- # write to s3
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
-
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"Logs: {str(r.status)}")
-
- ## ------------ Remove expect 204
-
- url = URL + 'containers/' + cid
- req = request.Request(url, method="DELETE")
- req.add_header('X-API-Key', APIKEY)
- # req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Remove: {str(r.status)}")
-
- return 0
-
-@op
-def euroceanvessels_gleaner(context):
- returned_value = gleanerio(("gleaner"), "euroceanvessels")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner notes are {r} ")
- return r
-
-@op
-def euroceanvessels_nabu(context, msg: str):
- returned_value = gleanerio(("nabu"), "euroceanvessels")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def euroceanvessels_nabuprov(context, msg: str):
- returned_value = gleanerio(("prov"), "euroceanvessels")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def euroceanvessels_nabuorg(context, msg: str):
- returned_value = gleanerio(("orgs"), "euroceanvessels")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def euroceanvessels_naburelease(context, msg: str):
- returned_value = gleanerio(("release"), "euroceanvessels")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@graph
-def harvest_euroceanvessels():
- harvest = euroceanvessels_gleaner()
- load1 = euroceanvessels_nabu(harvest)
- load2 = euroceanvessels_nabuprov(load1)
- load3 = euroceanvessels_nabuorg(load2)
- load4 = euroceanvessels_naburelease(load3)
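
For reference, the container lifecycle each deleted `gleanerio()` walks through — create, upload the config archive, start, wait, fetch logs, remove — reduces to a handful of Docker Engine API calls proxied by Portainer. A hedged consolidation of the repeated `urllib` boilerplate (the `portainer()` helper and demo image are illustrative; the endpoints and headers are taken from the code above):

```python
import json
import os
import urllib.parse
from urllib import request

URL = os.environ.get("PORTAINER_URL")    # Docker Engine API base, as above
APIKEY = os.environ.get("PORTAINER_KEY")

def portainer(path, method="GET", body=None, params=None,
              content_type="application/json"):
    """One round trip to the Docker Engine API behind Portainer."""
    url = URL + path
    if params:
        url += "?" + urllib.parse.urlencode(params)
    req = request.Request(url, data=body, method=method)
    req.add_header("X-API-Key", APIKEY)
    req.add_header("content-type", content_type)
    req.add_header("accept", "application/json")
    return request.urlopen(req)

# The lifecycle the deleted modules implement (config-archive PUT omitted):
r = portainer("containers/create", "POST",
              json.dumps({"Image": "busybox", "Cmd": ["true"]}).encode(),
              {"name": "demo"})
cid = json.loads(r.read())["Id"]
portainer(f"containers/{cid}/start", "POST")   # expect 2xx
portainer(f"containers/{cid}/wait", "POST")    # blocks until exit
logs = portainer(f"containers/{cid}/logs",
                 params={"stdout": "true", "stderr": "false"}).read()
portainer(f"containers/{cid}", "DELETE")       # expect 204
```
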
diff --git a/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_inanodc.py b/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_inanodc.py
deleted file mode 100644
index feb61b6c..00000000
--- a/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_inanodc.py
+++ /dev/null
@@ -1,293 +0,0 @@
-from dagster import op, graph, get_dagster_logger
-import subprocess
-import os, json, io
-import urllib
-from urllib import request
-from dagster import job, op, get_dagster_logger
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-
-# Vars and Envs
-
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-MINIO_URL = os.environ.get('GLEANER_MINIO_URL')
-MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT')
-MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL')
-MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET')
-MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY')
-MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET')
-
-
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- client = Minio(
- server,
- secure=False,
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
- try:
- data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- client = Minio(
- server,
- secure=False,
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname
- client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'),
- objPrefix,
- io.BytesIO(data),
- len(data))
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-
-
-def gleanerio(mode, source):
- ## ------------ Create
-
- get_dagster_logger().info(f"Create: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH')
- CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"]
- NAME = "gleaner01_" + source
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "nabu"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
- return 1
-
- data = {}
- data["Image"] = IMAGE
- data["Cmd"] = CMD
-
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
- enva = []
- enva.append(str("MINIO_URL={}".format(MINIO_URL)))
- enva.append(str("MINIO_PORT={}".format(MINIO_PORT)))
- enva.append(str("MINIO_SSL={}".format(MINIO_SSL)))
- enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET)))
- enva.append(str("MINIO_KEY={}".format(MINIO_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET)))
-
- data["Env"] = enva
-
- url = URL + 'containers/create'
- params = {
- "name": NAME
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- get_dagster_logger().info(f"URL: {str(url)}")
-
- req = request.Request(url, str.encode(json.dumps(data)))
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- c = r.read()
- d = json.loads(c)
- cid = d['Id']
-
- print(r.status)
- get_dagster_logger().info(f"Create: {str(r.status)}")
-
- # print(cid)
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
- url = URL + 'containers/' + cid + '/archive'
- params = {
- 'path': ARCHIVE_PATH
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
-
- # DATA = read_file_bytestream(ARCHIVE_FILE)
- DATA = s3reader(ARCHIVE_FILE)
-
- req = request.Request(url, data=DATA, method="PUT")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- print(r.status)
- get_dagster_logger().info(f"Archive: {str(r.status)}")
-
- # c = r.read()
- # print(c)
- # d = json.loads(c)
- # print(d)
-
- ## ------------ Start
-
- url = URL + 'containers/' + cid + '/start'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Start: {str(r.status)}")
-
- ## ------------ Wait expect 200
-
- url = URL + 'containers/' + cid + '/wait'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Wait: {str(r.status)}")
-
- ## ------------ Copy logs expect 200
-
- url = URL + 'containers/' + cid + '/logs'
- params = {
- 'stdout': 'true',
- 'stderr': 'false'
- }
- query_string = urllib.parse.urlencode(params)
-
- url = url + "?" + query_string
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- c = r.read()
-
- # write to file
- # f = open(LOGFILE, 'w')
- # f.write(str(c))
- # f.close()
-
- # write to s3
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
-
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"Logs: {str(r.status)}")
-
- ## ------------ Remove expect 204
-
- url = URL + 'containers/' + cid
- req = request.Request(url, method="DELETE")
- req.add_header('X-API-Key', APIKEY)
- # req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Remove: {str(r.status)}")
-
- return 0
-
-@op
-def inanodc_gleaner(context):
- returned_value = gleanerio(("gleaner"), "inanodc")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner notes are {r} ")
- return r
-
-@op
-def inanodc_nabu(context, msg: str):
- returned_value = gleanerio(("nabu"), "inanodc")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def inanodc_nabuprov(context, msg: str):
- returned_value = gleanerio(("prov"), "inanodc")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def inanodc_nabuorg(context, msg: str):
- returned_value = gleanerio(("orgs"), "inanodc")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def inanodc_naburelease(context, msg: str):
- returned_value = gleanerio(("release"), "inanodc")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@graph
-def harvest_inanodc():
- harvest = inanodc_gleaner()
- load1 = inanodc_nabu(harvest)
- load2 = inanodc_nabuprov(load1)
- load3 = inanodc_nabuorg(load2)
- load4 = inanodc_naburelease(load3)
diff --git a/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_invemardocuments.py b/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_invemardocuments.py
deleted file mode 100644
index 0a3131bb..00000000
--- a/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_invemardocuments.py
+++ /dev/null
@@ -1,293 +0,0 @@
-from dagster import op, graph, get_dagster_logger
-import subprocess
-import os, json, io
-import urllib
-from urllib import request
-from dagster import job, op, get_dagster_logger
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-
-# Vars and Envs
-
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-MINIO_URL = os.environ.get('GLEANER_MINIO_URL')
-MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT')
-MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL')
-MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET')
-MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY')
-MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET')
-
-
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- client = Minio(
- server,
- secure=False,
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
- try:
- data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- client = Minio(
- server,
- secure=False,
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname
- client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'),
- objPrefix,
- io.BytesIO(data),
- len(data))
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-
-
-def gleanerio(mode, source):
- ## ------------ Create
-
- get_dagster_logger().info(f"Create: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH')
- CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"]
- NAME = "gleaner01_" + source
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "nabu"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
- return 1
-
- data = {}
- data["Image"] = IMAGE
- data["Cmd"] = CMD
-
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
- enva = []
- enva.append(str("MINIO_URL={}".format(MINIO_URL)))
- enva.append(str("MINIO_PORT={}".format(MINIO_PORT)))
- enva.append(str("MINIO_SSL={}".format(MINIO_SSL)))
- enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET)))
- enva.append(str("MINIO_KEY={}".format(MINIO_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET)))
-
- data["Env"] = enva
-
- url = URL + 'containers/create'
- params = {
- "name": NAME
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- get_dagster_logger().info(f"URL: {str(url)}")
-
- req = request.Request(url, str.encode(json.dumps(data)))
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- c = r.read()
- d = json.loads(c)
- cid = d['Id']
-
- print(r.status)
- get_dagster_logger().info(f"Create: {str(r.status)}")
-
- # print(cid)
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
- url = URL + 'containers/' + cid + '/archive'
- params = {
- 'path': ARCHIVE_PATH
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
-
- # DATA = read_file_bytestream(ARCHIVE_FILE)
- DATA = s3reader(ARCHIVE_FILE)
-
- req = request.Request(url, data=DATA, method="PUT")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- print(r.status)
- get_dagster_logger().info(f"Archive: {str(r.status)}")
-
- # c = r.read()
- # print(c)
- # d = json.loads(c)
- # print(d)
-
- ## ------------ Start
-
- url = URL + 'containers/' + cid + '/start'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Start: {str(r.status)}")
-
- ## ------------ Wait expect 200
-
- url = URL + 'containers/' + cid + '/wait'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Wait: {str(r.status)}")
-
- ## ------------ Copy logs expect 200
-
- url = URL + 'containers/' + cid + '/logs'
- params = {
- 'stdout': 'true',
- 'stderr': 'false'
- }
- query_string = urllib.parse.urlencode(params)
-
- url = url + "?" + query_string
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- c = r.read()
-
- # write to file
- # f = open(LOGFILE, 'w')
- # f.write(str(c))
- # f.close()
-
- # write to s3
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
-
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"Logs: {str(r.status)}")
-
- ## ------------ Remove expect 204
-
- url = URL + 'containers/' + cid
- req = request.Request(url, method="DELETE")
- req.add_header('X-API-Key', APIKEY)
- # req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Remove: {str(r.status)}")
-
- return 0
-
-@op
-def invemardocuments_gleaner(context):
- returned_value = gleanerio(("gleaner"), "invemardocuments")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner notes are {r} ")
- return r
-
-@op
-def invemardocuments_nabu(context, msg: str):
- returned_value = gleanerio(("nabu"), "invemardocuments")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def invemardocuments_nabuprov(context, msg: str):
- returned_value = gleanerio(("prov"), "invemardocuments")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def invemardocuments_nabuorg(context, msg: str):
- returned_value = gleanerio(("orgs"), "invemardocuments")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def invemardocuments_naburelease(context, msg: str):
- returned_value = gleanerio(("release"), "invemardocuments")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@graph
-def harvest_invemardocuments():
- harvest = invemardocuments_gleaner()
- load1 = invemardocuments_nabu(harvest)
- load2 = invemardocuments_nabuprov(load1)
- load3 = invemardocuments_nabuorg(load2)
- load4 = invemardocuments_naburelease(load3)
diff --git a/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_invemarexperts.py b/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_invemarexperts.py
deleted file mode 100644
index 883bdef1..00000000
--- a/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_invemarexperts.py
+++ /dev/null
@@ -1,293 +0,0 @@
-from dagster import op, graph, get_dagster_logger
-import subprocess
-import os, json, io
-import urllib
-from urllib import request
-from dagster import job, op, get_dagster_logger
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-
-# Vars and Envs
-
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-MINIO_URL = os.environ.get('GLEANER_MINIO_URL')
-MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT')
-MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL')
-MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET')
-MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY')
-MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET')
-
-
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- client = Minio(
- server,
- secure=False,
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
- try:
- data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- client = Minio(
- server,
- secure=False,
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname
- client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'),
- objPrefix,
- io.BytesIO(data),
- len(data))
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-
-
-def gleanerio(mode, source):
- ## ------------ Create
-
- get_dagster_logger().info(f"Create: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH')
- CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"]
- NAME = "gleaner01_" + source
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "nabu"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
- return 1
-
- data = {}
- data["Image"] = IMAGE
- data["Cmd"] = CMD
-
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
- enva = []
- enva.append(str("MINIO_URL={}".format(MINIO_URL)))
- enva.append(str("MINIO_PORT={}".format(MINIO_PORT)))
- enva.append(str("MINIO_SSL={}".format(MINIO_SSL)))
- enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET)))
- enva.append(str("MINIO_KEY={}".format(MINIO_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET)))
-
- data["Env"] = enva
-
- url = URL + 'containers/create'
- params = {
- "name": NAME
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- get_dagster_logger().info(f"URL: {str(url)}")
-
- req = request.Request(url, str.encode(json.dumps(data)))
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- c = r.read()
- d = json.loads(c)
- cid = d['Id']
-
- print(r.status)
- get_dagster_logger().info(f"Create: {str(r.status)}")
-
- # print(cid)
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
- url = URL + 'containers/' + cid + '/archive'
- params = {
- 'path': ARCHIVE_PATH
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
-
- # DATA = read_file_bytestream(ARCHIVE_FILE)
- DATA = s3reader(ARCHIVE_FILE)
-
- req = request.Request(url, data=DATA, method="PUT")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- print(r.status)
- get_dagster_logger().info(f"Archive: {str(r.status)}")
-
- # c = r.read()
- # print(c)
- # d = json.loads(c)
- # print(d)
-
- ## ------------ Start
-
- url = URL + 'containers/' + cid + '/start'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Start: {str(r.status)}")
-
- ## ------------ Wait expect 200
-
- url = URL + 'containers/' + cid + '/wait'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Wait: {str(r.status)}")
-
- ## ------------ Copy logs expect 200
-
- url = URL + 'containers/' + cid + '/logs'
- params = {
- 'stdout': 'true',
- 'stderr': 'false'
- }
- query_string = urllib.parse.urlencode(params)
-
- url = url + "?" + query_string
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- c = r.read()
-
- # write to file
- # f = open(LOGFILE, 'w')
- # f.write(str(c))
- # f.close()
-
- # write to s3
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
-
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"Logs: {str(r.status)}")
-
- ## ------------ Remove expect 204
-
- url = URL + 'containers/' + cid
- req = request.Request(url, method="DELETE")
- req.add_header('X-API-Key', APIKEY)
- # req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Remove: {str(r.status)}")
-
- return 0
-
-@op
-def invemarexperts_gleaner(context):
- returned_value = gleanerio(("gleaner"), "invemarexperts")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner notes are {r} ")
- return r
-
-@op
-def invemarexperts_nabu(context, msg: str):
- returned_value = gleanerio(("nabu"), "invemarexperts")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def invemarexperts_nabuprov(context, msg: str):
- returned_value = gleanerio(("prov"), "invemarexperts")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def invemarexperts_nabuorg(context, msg: str):
- returned_value = gleanerio(("orgs"), "invemarexperts")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def invemarexperts_naburelease(context, msg: str):
- returned_value = gleanerio(("release"), "invemarexperts")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@graph
-def harvest_invemarexperts():
- harvest = invemarexperts_gleaner()
- load1 = invemarexperts_nabu(harvest)
- load2 = invemarexperts_nabuprov(load1)
- load3 = invemarexperts_nabuorg(load2)
- load4 = invemarexperts_naburelease(load3)
diff --git a/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_invemarinstitutions.py b/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_invemarinstitutions.py
deleted file mode 100644
index c3dd3730..00000000
--- a/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_invemarinstitutions.py
+++ /dev/null
@@ -1,293 +0,0 @@
-from dagster import op, graph, get_dagster_logger
-import subprocess
-import os, json, io
-import urllib
-from urllib import request
-from dagster import job, op, get_dagster_logger
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-
-# Vars and Envs
-
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-MINIO_URL = os.environ.get('GLEANER_MINIO_URL')
-MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT')
-MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL')
-MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET')
-MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY')
-MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET')
-
-
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- client = Minio(
- server,
- secure=False,
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
- try:
- data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- client = Minio(
- server,
- secure=False,
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname
- client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'),
- objPrefix,
- io.BytesIO(data),
- len(data))
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-
-
-def gleanerio(mode, source):
- ## ------------ Create
-
- get_dagster_logger().info(f"Create: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH')
- CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"]
- NAME = "gleaner01_" + source
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "nabu"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
- return 1
-
- data = {}
- data["Image"] = IMAGE
- data["Cmd"] = CMD
-
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
- enva = []
- enva.append(str("MINIO_URL={}".format(MINIO_URL)))
- enva.append(str("MINIO_PORT={}".format(MINIO_PORT)))
- enva.append(str("MINIO_SSL={}".format(MINIO_SSL)))
- enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET)))
- enva.append(str("MINIO_KEY={}".format(MINIO_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET)))
-
- data["Env"] = enva
-
- url = URL + 'containers/create'
- params = {
- "name": NAME
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- get_dagster_logger().info(f"URL: {str(url)}")
-
- req = request.Request(url, str.encode(json.dumps(data)))
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- c = r.read()
- d = json.loads(c)
- cid = d['Id']
-
- print(r.status)
- get_dagster_logger().info(f"Create: {str(r.status)}")
-
- # print(cid)
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
- url = URL + 'containers/' + cid + '/archive'
- params = {
- 'path': ARCHIVE_PATH
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
-
- # DATA = read_file_bytestream(ARCHIVE_FILE)
- DATA = s3reader(ARCHIVE_FILE)
-
- req = request.Request(url, data=DATA, method="PUT")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- print(r.status)
- get_dagster_logger().info(f"Archive: {str(r.status)}")
-
- # c = r.read()
- # print(c)
- # d = json.loads(c)
- # print(d)
-
- ## ------------ Start
-
- url = URL + 'containers/' + cid + '/start'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Start: {str(r.status)}")
-
- ## ------------ Wait expect 200
-
- url = URL + 'containers/' + cid + '/wait'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Wait: {str(r.status)}")
-
- ## ------------ Copy logs expect 200
-
- url = URL + 'containers/' + cid + '/logs'
- params = {
- 'stdout': 'true',
- 'stderr': 'false'
- }
- query_string = urllib.parse.urlencode(params)
-
- url = url + "?" + query_string
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- c = r.read()
-
- # write to file
- # f = open(LOGFILE, 'w')
- # f.write(str(c))
- # f.close()
-
- # write to s3
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
-
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"Logs: {str(r.status)}")
-
- ## ------------ Remove expect 204
-
- url = URL + 'containers/' + cid
- req = request.Request(url, method="DELETE")
- req.add_header('X-API-Key', APIKEY)
- # req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Remove: {str(r.status)}")
-
- return 0
-
-@op
-def invemarinstitutions_gleaner(context):
- returned_value = gleanerio(("gleaner"), "invemarinstitutions")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner notes are {r} ")
- return r
-
-@op
-def invemarinstitutions_nabu(context, msg: str):
- returned_value = gleanerio(("nabu"), "invemarinstitutions")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def invemarinstitutions_nabuprov(context, msg: str):
- returned_value = gleanerio(("prov"), "invemarinstitutions")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def invemarinstitutions_nabuorg(context, msg: str):
- returned_value = gleanerio(("orgs"), "invemarinstitutions")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def invemarinstitutions_naburelease(context, msg: str):
- returned_value = gleanerio(("release"), "invemarinstitutions")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@graph
-def harvest_invemarinstitutions():
- harvest = invemarinstitutions_gleaner()
- load1 = invemarinstitutions_nabu(harvest)
- load2 = invemarinstitutions_nabuprov(load1)
- load3 = invemarinstitutions_nabuorg(load2)
- load4 = invemarinstitutions_naburelease(load3)
diff --git a/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_invemartraining.py b/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_invemartraining.py
deleted file mode 100644
index 2e4b6e19..00000000
--- a/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_invemartraining.py
+++ /dev/null
@@ -1,293 +0,0 @@
-from dagster import op, graph, get_dagster_logger
-import subprocess
-import os, json, io
-import urllib
-from urllib import request
-from dagster import job, op, get_dagster_logger
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-
-# Vars and Envs
-
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-MINIO_URL = os.environ.get('GLEANER_MINIO_URL')
-MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT')
-MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL')
-MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET')
-MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY')
-MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET')
-
-
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- client = Minio(
- server,
- secure=False,
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
- try:
- data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- client = Minio(
- server,
- secure=False,
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname
- client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'),
- objPrefix,
- io.BytesIO(data),
- len(data))
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-
-
-def gleanerio(mode, source):
- ## ------------ Create
-
- get_dagster_logger().info(f"Create: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH')
- CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"]
- NAME = "gleaner01_" + source
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "nabu"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
- return 1
-
- data = {}
- data["Image"] = IMAGE
- data["Cmd"] = CMD
-
-    # Environment variables passed to the container, e.g. "Env": ["FOO=bar", "BAZ=quux"]
-    enva = []
-    enva.append("MINIO_URL={}".format(MINIO_URL))
-    enva.append("MINIO_PORT={}".format(MINIO_PORT))
-    enva.append("MINIO_SSL={}".format(MINIO_SSL))
-    enva.append("MINIO_SECRET={}".format(MINIO_SECRET))
-    enva.append("MINIO_KEY={}".format(MINIO_KEY))
-    enva.append("MINIO_BUCKET={}".format(MINIO_BUCKET))
-
- data["Env"] = enva
-
- url = URL + 'containers/create'
- params = {
- "name": NAME
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- get_dagster_logger().info(f"URL: {str(url)}")
-
- req = request.Request(url, str.encode(json.dumps(data)))
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- c = r.read()
- d = json.loads(c)
- cid = d['Id']
-
- print(r.status)
- get_dagster_logger().info(f"Create: {str(r.status)}")
-
- # print(cid)
-
-    ## ------------ Archive: PUT the config archive (read from S3 below) into the container
-
- url = URL + 'containers/' + cid + '/archive'
- params = {
- 'path': ARCHIVE_PATH
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
-
- # DATA = read_file_bytestream(ARCHIVE_FILE)
- DATA = s3reader(ARCHIVE_FILE)
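-    # The archive endpoint mirrors Docker's PUT /containers/{id}/archive, which
-    # expects a tar stream, so the S3 object read above should already be a tar archive.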
-
- req = request.Request(url, data=DATA, method="PUT")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- print(r.status)
- get_dagster_logger().info(f"Archive: {str(r.status)}")
-
- # c = r.read()
- # print(c)
- # d = json.loads(c)
- # print(d)
-
- ## ------------ Start
-
- url = URL + 'containers/' + cid + '/start'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Start: {str(r.status)}")
-
- ## ------------ Wait expect 200
-
- url = URL + 'containers/' + cid + '/wait'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Wait: {str(r.status)}")
-
- ## ------------ Copy logs expect 200
-
- url = URL + 'containers/' + cid + '/logs'
- params = {
- 'stdout': 'true',
- 'stderr': 'false'
- }
- query_string = urllib.parse.urlencode(params)
-
- url = url + "?" + query_string
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- c = r.read()
-
- # write to file
- # f = open(LOGFILE, 'w')
- # f.write(str(c))
- # f.close()
-
- # write to s3
-    s3loader(c, NAME)  # r.read() already returns bytes, which is what s3loader needs
-
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"Logs: {str(r.status)}")
-
- ## ------------ Remove expect 204
-
- url = URL + 'containers/' + cid
- req = request.Request(url, method="DELETE")
- req.add_header('X-API-Key', APIKEY)
- # req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Remove: {str(r.status)}")
-
- return 0
-
-@op
-def invemartraining_gleaner(context):
-    returned_value = gleanerio("gleaner", "invemartraining")
-    r = 'returned value:{}'.format(returned_value)
-    get_dagster_logger().info(f"Gleaner notes are {r}")
-    return r
-
-@op
-def invemartraining_nabu(context, msg: str):
-    returned_value = gleanerio("nabu", "invemartraining")
-    r = 'returned value:{}'.format(returned_value)
-    return msg + r
-
-@op
-def invemartraining_nabuprov(context, msg: str):
-    returned_value = gleanerio("prov", "invemartraining")
-    r = 'returned value:{}'.format(returned_value)
-    return msg + r
-
-@op
-def invemartraining_nabuorg(context, msg: str):
-    returned_value = gleanerio("orgs", "invemartraining")
-    r = 'returned value:{}'.format(returned_value)
-    return msg + r
-
-@op
-def invemartraining_naburelease(context, msg: str):
-    returned_value = gleanerio("release", "invemartraining")
-    r = 'returned value:{}'.format(returned_value)
-    return msg + r
-
-@graph
-def harvest_invemartraining():
-    harvest = invemartraining_gleaner()
-    load1 = invemartraining_nabu(harvest)
-    load2 = invemartraining_nabuprov(load1)
-    load3 = invemartraining_nabuorg(load2)
-    load4 = invemartraining_naburelease(load3)
diff --git a/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_invemarvessels.py b/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_invemarvessels.py
deleted file mode 100644
index 50bad4c5..00000000
--- a/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_invemarvessels.py
+++ /dev/null
@@ -1,293 +0,0 @@
-from dagster import op, graph, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-
-# Vars and Envs
-
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-MINIO_URL = os.environ.get('GLEANER_MINIO_URL')
-MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT')
-MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL')
-MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET')
-MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY')
-MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET')
-
-
-def read_file_bytestream(image_path):
-    # Read the whole file into memory; the context manager closes the handle.
-    with open(image_path, 'rb') as f:
-        return f.read()
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object_name):
-    server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
-    client = Minio(
-        server,
-        secure=False,
-        access_key=os.environ.get('GLEANER_MINIO_KEY'),
-        secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
-    )
-    try:
-        data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object_name)
-        return data
-    except S3Error as err:
-        get_dagster_logger().info(f"S3 read error: {str(err)}")
-
-
-def s3loader(data, name):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- client = Minio(
- server,
- secure=False,
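-        # NOTE: secure is hard-coded to False here; GLEANER_MINIO_SSL is read above but not applied.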
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname
- client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'),
- objPrefix,
- io.BytesIO(data),
- len(data))
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-
-
-def gleanerio(mode, source):
- ## ------------ Create
-
- get_dagster_logger().info(f"Create: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH')
- CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"]
- NAME = "gleaner01_" + source
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "nabu"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
- return 1
-
- data = {}
- data["Image"] = IMAGE
- data["Cmd"] = CMD
-
-    # Environment variables passed to the container, e.g. "Env": ["FOO=bar", "BAZ=quux"]
-    enva = []
-    enva.append("MINIO_URL={}".format(MINIO_URL))
-    enva.append("MINIO_PORT={}".format(MINIO_PORT))
-    enva.append("MINIO_SSL={}".format(MINIO_SSL))
-    enva.append("MINIO_SECRET={}".format(MINIO_SECRET))
-    enva.append("MINIO_KEY={}".format(MINIO_KEY))
-    enva.append("MINIO_BUCKET={}".format(MINIO_BUCKET))
-
- data["Env"] = enva
-
- url = URL + 'containers/create'
- params = {
- "name": NAME
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- get_dagster_logger().info(f"URL: {str(url)}")
-
- req = request.Request(url, str.encode(json.dumps(data)))
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- c = r.read()
- d = json.loads(c)
- cid = d['Id']
-
- print(r.status)
- get_dagster_logger().info(f"Create: {str(r.status)}")
-
- # print(cid)
-
-    ## ------------ Archive: PUT the config archive (read from S3 below) into the container
-
- url = URL + 'containers/' + cid + '/archive'
- params = {
- 'path': ARCHIVE_PATH
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
-
- # DATA = read_file_bytestream(ARCHIVE_FILE)
- DATA = s3reader(ARCHIVE_FILE)
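-    # The archive endpoint mirrors Docker's PUT /containers/{id}/archive, which
-    # expects a tar stream, so the S3 object read above should already be a tar archive.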
-
- req = request.Request(url, data=DATA, method="PUT")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- print(r.status)
- get_dagster_logger().info(f"Archive: {str(r.status)}")
-
- # c = r.read()
- # print(c)
- # d = json.loads(c)
- # print(d)
-
- ## ------------ Start
-
- url = URL + 'containers/' + cid + '/start'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Start: {str(r.status)}")
-
- ## ------------ Wait expect 200
-
- url = URL + 'containers/' + cid + '/wait'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Wait: {str(r.status)}")
-
- ## ------------ Copy logs expect 200
-
- url = URL + 'containers/' + cid + '/logs'
- params = {
- 'stdout': 'true',
- 'stderr': 'false'
- }
- query_string = urllib.parse.urlencode(params)
-
- url = url + "?" + query_string
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- c = r.read()
-
- # write to file
- # f = open(LOGFILE, 'w')
- # f.write(str(c))
- # f.close()
-
- # write to s3
-    s3loader(c, NAME)  # r.read() already returns bytes, which is what s3loader needs
-
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"Logs: {str(r.status)}")
-
- ## ------------ Remove expect 204
-
- url = URL + 'containers/' + cid
- req = request.Request(url, method="DELETE")
- req.add_header('X-API-Key', APIKEY)
- # req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Remove: {str(r.status)}")
-
- return 0
-
-@op
-def invemarvessels_gleaner(context):
-    returned_value = gleanerio("gleaner", "invemarvessels")
-    r = 'returned value:{}'.format(returned_value)
-    get_dagster_logger().info(f"Gleaner notes are {r}")
-    return r
-
-@op
-def invemarvessels_nabu(context, msg: str):
-    returned_value = gleanerio("nabu", "invemarvessels")
-    r = 'returned value:{}'.format(returned_value)
-    return msg + r
-
-@op
-def invemarvessels_nabuprov(context, msg: str):
-    returned_value = gleanerio("prov", "invemarvessels")
-    r = 'returned value:{}'.format(returned_value)
-    return msg + r
-
-@op
-def invemarvessels_nabuorg(context, msg: str):
-    returned_value = gleanerio("orgs", "invemarvessels")
-    r = 'returned value:{}'.format(returned_value)
-    return msg + r
-
-@op
-def invemarvessels_naburelease(context, msg: str):
-    returned_value = gleanerio("release", "invemarvessels")
-    r = 'returned value:{}'.format(returned_value)
-    return msg + r
-
-@graph
-def harvest_invemarvessels():
-    harvest = invemarvessels_gleaner()
-    load1 = invemarvessels_nabu(harvest)
-    load2 = invemarvessels_nabuprov(load1)
-    load3 = invemarvessels_nabuorg(load2)
-    load4 = invemarvessels_naburelease(load3)
diff --git a/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_marinetraining.py b/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_marinetraining.py
deleted file mode 100644
index 6ff02664..00000000
--- a/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_marinetraining.py
+++ /dev/null
@@ -1,293 +0,0 @@
-from dagster import op, graph, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-
-# Vars and Envs
-
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-MINIO_URL = os.environ.get('GLEANER_MINIO_URL')
-MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT')
-MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL')
-MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET')
-MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY')
-MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET')
-
-
-def read_file_bytestream(image_path):
-    # Read the whole file into memory; the context manager closes the handle.
-    with open(image_path, 'rb') as f:
-        return f.read()
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object_name):
-    server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
-    client = Minio(
-        server,
-        secure=False,
-        access_key=os.environ.get('GLEANER_MINIO_KEY'),
-        secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
-    )
-    try:
-        data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object_name)
-        return data
-    except S3Error as err:
-        get_dagster_logger().info(f"S3 read error: {str(err)}")
-
-
-def s3loader(data, name):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- client = Minio(
- server,
- secure=False,
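-        # NOTE: secure is hard-coded to False here; GLEANER_MINIO_SSL is read above but not applied.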
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname
- client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'),
- objPrefix,
- io.BytesIO(data),
- len(data))
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-
-
-def gleanerio(mode, source):
- ## ------------ Create
-
- get_dagster_logger().info(f"Create: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH')
- CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"]
- NAME = "gleaner01_" + source
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "nabu"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
- return 1
-
- data = {}
- data["Image"] = IMAGE
- data["Cmd"] = CMD
-
-    # Environment variables passed to the container, e.g. "Env": ["FOO=bar", "BAZ=quux"]
-    enva = []
-    enva.append("MINIO_URL={}".format(MINIO_URL))
-    enva.append("MINIO_PORT={}".format(MINIO_PORT))
-    enva.append("MINIO_SSL={}".format(MINIO_SSL))
-    enva.append("MINIO_SECRET={}".format(MINIO_SECRET))
-    enva.append("MINIO_KEY={}".format(MINIO_KEY))
-    enva.append("MINIO_BUCKET={}".format(MINIO_BUCKET))
-
- data["Env"] = enva
-
- url = URL + 'containers/create'
- params = {
- "name": NAME
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- get_dagster_logger().info(f"URL: {str(url)}")
-
- req = request.Request(url, str.encode(json.dumps(data)))
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- c = r.read()
- d = json.loads(c)
- cid = d['Id']
-
- print(r.status)
- get_dagster_logger().info(f"Create: {str(r.status)}")
-
- # print(cid)
-
-    ## ------------ Archive: PUT the config archive (read from S3 below) into the container
-
- url = URL + 'containers/' + cid + '/archive'
- params = {
- 'path': ARCHIVE_PATH
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
-
- # DATA = read_file_bytestream(ARCHIVE_FILE)
- DATA = s3reader(ARCHIVE_FILE)
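-    # The archive endpoint mirrors Docker's PUT /containers/{id}/archive, which
-    # expects a tar stream, so the S3 object read above should already be a tar archive.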
-
- req = request.Request(url, data=DATA, method="PUT")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- print(r.status)
- get_dagster_logger().info(f"Archive: {str(r.status)}")
-
- # c = r.read()
- # print(c)
- # d = json.loads(c)
- # print(d)
-
- ## ------------ Start
-
- url = URL + 'containers/' + cid + '/start'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Start: {str(r.status)}")
-
- ## ------------ Wait expect 200
-
- url = URL + 'containers/' + cid + '/wait'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Wait: {str(r.status)}")
-
- ## ------------ Copy logs expect 200
-
- url = URL + 'containers/' + cid + '/logs'
- params = {
- 'stdout': 'true',
- 'stderr': 'false'
- }
- query_string = urllib.parse.urlencode(params)
-
- url = url + "?" + query_string
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- c = r.read()
-
- # write to file
- # f = open(LOGFILE, 'w')
- # f.write(str(c))
- # f.close()
-
- # write to s3
-    s3loader(c, NAME)  # r.read() already returns bytes, which is what s3loader needs
-
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"Logs: {str(r.status)}")
-
- ## ------------ Remove expect 204
-
- url = URL + 'containers/' + cid
- req = request.Request(url, method="DELETE")
- req.add_header('X-API-Key', APIKEY)
- # req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Remove: {str(r.status)}")
-
- return 0
-
-@op
-def marinetraining_gleaner(context):
-    returned_value = gleanerio("gleaner", "marinetraining")
-    r = 'returned value:{}'.format(returned_value)
-    get_dagster_logger().info(f"Gleaner notes are {r}")
-    return r
-
-@op
-def marinetraining_nabu(context, msg: str):
-    returned_value = gleanerio("nabu", "marinetraining")
-    r = 'returned value:{}'.format(returned_value)
-    return msg + r
-
-@op
-def marinetraining_nabuprov(context, msg: str):
-    returned_value = gleanerio("prov", "marinetraining")
-    r = 'returned value:{}'.format(returned_value)
-    return msg + r
-
-@op
-def marinetraining_nabuorg(context, msg: str):
-    returned_value = gleanerio("orgs", "marinetraining")
-    r = 'returned value:{}'.format(returned_value)
-    return msg + r
-
-@op
-def marinetraining_naburelease(context, msg: str):
-    returned_value = gleanerio("release", "marinetraining")
-    r = 'returned value:{}'.format(returned_value)
-    return msg + r
-
-@graph
-def harvest_marinetraining():
-    harvest = marinetraining_gleaner()
-    load1 = marinetraining_nabu(harvest)
-    load2 = marinetraining_nabuprov(load1)
-    load3 = marinetraining_nabuorg(load2)
-    load4 = marinetraining_naburelease(load3)
diff --git a/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_maspawio.py b/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_maspawio.py
deleted file mode 100644
index 03ecea27..00000000
--- a/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_maspawio.py
+++ /dev/null
@@ -1,293 +0,0 @@
-from dagster import op, graph, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-
-# Vars and Envs
-
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-MINIO_URL = os.environ.get('GLEANER_MINIO_URL')
-MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT')
-MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL')
-MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET')
-MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY')
-MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET')
-
-
-def read_file_bytestream(image_path):
-    # Read the whole file into memory; the context manager closes the handle.
-    with open(image_path, 'rb') as f:
-        return f.read()
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object_name):
-    server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
-    client = Minio(
-        server,
-        secure=False,
-        access_key=os.environ.get('GLEANER_MINIO_KEY'),
-        secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
-    )
-    try:
-        data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object_name)
-        return data
-    except S3Error as err:
-        get_dagster_logger().info(f"S3 read error: {str(err)}")
-
-
-def s3loader(data, name):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- client = Minio(
- server,
- secure=False,
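-        # NOTE: secure is hard-coded to False here; GLEANER_MINIO_SSL is read above but not applied.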
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname
- client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'),
- objPrefix,
- io.BytesIO(data),
- len(data))
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-
-
-def gleanerio(mode, source):
- ## ------------ Create
-
- get_dagster_logger().info(f"Create: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH')
- CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"]
- NAME = "gleaner01_" + source
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "nabu"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
- return 1
-
- data = {}
- data["Image"] = IMAGE
- data["Cmd"] = CMD
-
-    # Environment variables passed to the container, e.g. "Env": ["FOO=bar", "BAZ=quux"]
-    enva = []
-    enva.append("MINIO_URL={}".format(MINIO_URL))
-    enva.append("MINIO_PORT={}".format(MINIO_PORT))
-    enva.append("MINIO_SSL={}".format(MINIO_SSL))
-    enva.append("MINIO_SECRET={}".format(MINIO_SECRET))
-    enva.append("MINIO_KEY={}".format(MINIO_KEY))
-    enva.append("MINIO_BUCKET={}".format(MINIO_BUCKET))
-
- data["Env"] = enva
-
- url = URL + 'containers/create'
- params = {
- "name": NAME
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- get_dagster_logger().info(f"URL: {str(url)}")
-
- req = request.Request(url, str.encode(json.dumps(data)))
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- c = r.read()
- d = json.loads(c)
- cid = d['Id']
-
- print(r.status)
- get_dagster_logger().info(f"Create: {str(r.status)}")
-
- # print(cid)
-
-    ## ------------ Archive: PUT the config archive (read from S3 below) into the container
-
- url = URL + 'containers/' + cid + '/archive'
- params = {
- 'path': ARCHIVE_PATH
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
-
- # DATA = read_file_bytestream(ARCHIVE_FILE)
- DATA = s3reader(ARCHIVE_FILE)
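-    # The archive endpoint mirrors Docker's PUT /containers/{id}/archive, which
-    # expects a tar stream, so the S3 object read above should already be a tar archive.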
-
- req = request.Request(url, data=DATA, method="PUT")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- print(r.status)
- get_dagster_logger().info(f"Archive: {str(r.status)}")
-
- # c = r.read()
- # print(c)
- # d = json.loads(c)
- # print(d)
-
- ## ------------ Start
-
- url = URL + 'containers/' + cid + '/start'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Start: {str(r.status)}")
-
- ## ------------ Wait expect 200
-
- url = URL + 'containers/' + cid + '/wait'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Wait: {str(r.status)}")
-
- ## ------------ Copy logs expect 200
-
- url = URL + 'containers/' + cid + '/logs'
- params = {
- 'stdout': 'true',
- 'stderr': 'false'
- }
- query_string = urllib.parse.urlencode(params)
-
- url = url + "?" + query_string
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- c = r.read()
-
- # write to file
- # f = open(LOGFILE, 'w')
- # f.write(str(c))
- # f.close()
-
- # write to s3
-    s3loader(c, NAME)  # r.read() already returns bytes, which is what s3loader needs
-
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"Logs: {str(r.status)}")
-
- ## ------------ Remove expect 204
-
- url = URL + 'containers/' + cid
- req = request.Request(url, method="DELETE")
- req.add_header('X-API-Key', APIKEY)
- # req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Remove: {str(r.status)}")
-
- return 0
-
-@op
-def maspawio_gleaner(context):
-    returned_value = gleanerio("gleaner", "maspawio")
-    r = 'returned value:{}'.format(returned_value)
-    get_dagster_logger().info(f"Gleaner notes are {r}")
-    return r
-
-@op
-def maspawio_nabu(context, msg: str):
-    returned_value = gleanerio("nabu", "maspawio")
-    r = 'returned value:{}'.format(returned_value)
-    return msg + r
-
-@op
-def maspawio_nabuprov(context, msg: str):
-    returned_value = gleanerio("prov", "maspawio")
-    r = 'returned value:{}'.format(returned_value)
-    return msg + r
-
-@op
-def maspawio_nabuorg(context, msg: str):
-    returned_value = gleanerio("orgs", "maspawio")
-    r = 'returned value:{}'.format(returned_value)
-    return msg + r
-
-@op
-def maspawio_naburelease(context, msg: str):
-    returned_value = gleanerio("release", "maspawio")
-    r = 'returned value:{}'.format(returned_value)
-    return msg + r
-
-@graph
-def harvest_maspawio():
-    harvest = maspawio_gleaner()
-    load1 = maspawio_nabu(harvest)
-    load2 = maspawio_nabuprov(load1)
-    load3 = maspawio_nabuorg(load2)
-    load4 = maspawio_naburelease(load3)
diff --git a/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_obis.py b/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_obis.py
deleted file mode 100644
index d8441180..00000000
--- a/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_obis.py
+++ /dev/null
@@ -1,293 +0,0 @@
-from dagster import op, graph, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-
-# Vars and Envs
-
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-MINIO_URL = os.environ.get('GLEANER_MINIO_URL')
-MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT')
-MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL')
-MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET')
-MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY')
-MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET')
-
-
-def read_file_bytestream(image_path):
-    # Read the whole file into memory; the context manager closes the handle.
-    with open(image_path, 'rb') as f:
-        return f.read()
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object_name):
-    server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
-    client = Minio(
-        server,
-        secure=False,
-        access_key=os.environ.get('GLEANER_MINIO_KEY'),
-        secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
-    )
-    try:
-        data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object_name)
-        return data
-    except S3Error as err:
-        get_dagster_logger().info(f"S3 read error: {str(err)}")
-
-
-def s3loader(data, name):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- client = Minio(
- server,
- secure=False,
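-        # NOTE: secure is hard-coded to False here; GLEANER_MINIO_SSL is read above but not applied.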
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname
- client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'),
- objPrefix,
- io.BytesIO(data),
- len(data))
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-
-
-def gleanerio(mode, source):
- ## ------------ Create
-
- get_dagster_logger().info(f"Create: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH')
- CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"]
- NAME = "gleaner01_" + source
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "nabu"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
- return 1
-
- data = {}
- data["Image"] = IMAGE
- data["Cmd"] = CMD
-
-    # Environment variables passed to the container, e.g. "Env": ["FOO=bar", "BAZ=quux"]
-    enva = []
-    enva.append("MINIO_URL={}".format(MINIO_URL))
-    enva.append("MINIO_PORT={}".format(MINIO_PORT))
-    enva.append("MINIO_SSL={}".format(MINIO_SSL))
-    enva.append("MINIO_SECRET={}".format(MINIO_SECRET))
-    enva.append("MINIO_KEY={}".format(MINIO_KEY))
-    enva.append("MINIO_BUCKET={}".format(MINIO_BUCKET))
-
- data["Env"] = enva
-
- url = URL + 'containers/create'
- params = {
- "name": NAME
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- get_dagster_logger().info(f"URL: {str(url)}")
-
- req = request.Request(url, str.encode(json.dumps(data)))
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- c = r.read()
- d = json.loads(c)
- cid = d['Id']
-
- print(r.status)
- get_dagster_logger().info(f"Create: {str(r.status)}")
-
- # print(cid)
-
-    ## ------------ Archive: PUT the config archive (read from S3 below) into the container
-
- url = URL + 'containers/' + cid + '/archive'
- params = {
- 'path': ARCHIVE_PATH
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
-
- # DATA = read_file_bytestream(ARCHIVE_FILE)
- DATA = s3reader(ARCHIVE_FILE)
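-    # The archive endpoint mirrors Docker's PUT /containers/{id}/archive, which
-    # expects a tar stream, so the S3 object read above should already be a tar archive.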
-
- req = request.Request(url, data=DATA, method="PUT")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- print(r.status)
- get_dagster_logger().info(f"Archive: {str(r.status)}")
-
- # c = r.read()
- # print(c)
- # d = json.loads(c)
- # print(d)
-
- ## ------------ Start
-
- url = URL + 'containers/' + cid + '/start'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Start: {str(r.status)}")
-
- ## ------------ Wait expect 200
-
- url = URL + 'containers/' + cid + '/wait'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Wait: {str(r.status)}")
-
- ## ------------ Copy logs expect 200
-
- url = URL + 'containers/' + cid + '/logs'
- params = {
- 'stdout': 'true',
- 'stderr': 'false'
- }
- query_string = urllib.parse.urlencode(params)
-
- url = url + "?" + query_string
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- c = r.read()
-
- # write to file
- # f = open(LOGFILE, 'w')
- # f.write(str(c))
- # f.close()
-
- # write to s3
-    s3loader(c, NAME)  # r.read() already returns bytes, which is what s3loader needs
-
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"Logs: {str(r.status)}")
-
- ## ------------ Remove expect 204
-
- url = URL + 'containers/' + cid
- req = request.Request(url, method="DELETE")
- req.add_header('X-API-Key', APIKEY)
- # req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Remove: {str(r.status)}")
-
- return 0
-
-@op
-def obis_gleaner(context):
-    returned_value = gleanerio("gleaner", "obis")
-    r = 'returned value:{}'.format(returned_value)
-    get_dagster_logger().info(f"Gleaner notes are {r}")
-    return r
-
-@op
-def obis_nabu(context, msg: str):
-    returned_value = gleanerio("nabu", "obis")
-    r = 'returned value:{}'.format(returned_value)
-    return msg + r
-
-@op
-def obis_nabuprov(context, msg: str):
-    returned_value = gleanerio("prov", "obis")
-    r = 'returned value:{}'.format(returned_value)
-    return msg + r
-
-@op
-def obis_nabuorg(context, msg: str):
-    returned_value = gleanerio("orgs", "obis")
-    r = 'returned value:{}'.format(returned_value)
-    return msg + r
-
-@op
-def obis_naburelease(context, msg: str):
-    returned_value = gleanerio("release", "obis")
-    r = 'returned value:{}'.format(returned_value)
-    return msg + r
-
-@graph
-def harvest_obis():
-    harvest = obis_gleaner()
-    load1 = obis_nabu(harvest)
-    load2 = obis_nabuprov(load1)
-    load3 = obis_nabuorg(load2)
-    load4 = obis_naburelease(load3)
diff --git a/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_obps.py b/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_obps.py
deleted file mode 100644
index 8de17c32..00000000
--- a/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_obps.py
+++ /dev/null
@@ -1,293 +0,0 @@
-from dagster import op, graph, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-
-# Vars and Envs
-
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-MINIO_URL = os.environ.get('GLEANER_MINIO_URL')
-MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT')
-MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL')
-MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET')
-MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY')
-MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET')
-
-
-def read_file_bytestream(image_path):
-    # Read the whole file into memory; the context manager closes the handle.
-    with open(image_path, 'rb') as f:
-        return f.read()
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object_name):
-    server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
-    client = Minio(
-        server,
-        secure=False,
-        access_key=os.environ.get('GLEANER_MINIO_KEY'),
-        secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
-    )
-    try:
-        data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object_name)
-        return data
-    except S3Error as err:
-        get_dagster_logger().info(f"S3 read error: {str(err)}")
-
-
-def s3loader(data, name):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- client = Minio(
- server,
- secure=False,
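-        # NOTE: secure is hard-coded to False here; GLEANER_MINIO_SSL is read above but not applied.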
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname
- client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'),
- objPrefix,
- io.BytesIO(data),
- len(data))
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-
-
-def gleanerio(mode, source):
- ## ------------ Create
-
- get_dagster_logger().info(f"Create: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH')
- CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"]
- NAME = "gleaner01_" + source
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "nabu"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
- return 1
-
- data = {}
- data["Image"] = IMAGE
- data["Cmd"] = CMD
-
-    # Environment variables passed to the container, e.g. "Env": ["FOO=bar", "BAZ=quux"]
-    enva = []
-    enva.append("MINIO_URL={}".format(MINIO_URL))
-    enva.append("MINIO_PORT={}".format(MINIO_PORT))
-    enva.append("MINIO_SSL={}".format(MINIO_SSL))
-    enva.append("MINIO_SECRET={}".format(MINIO_SECRET))
-    enva.append("MINIO_KEY={}".format(MINIO_KEY))
-    enva.append("MINIO_BUCKET={}".format(MINIO_BUCKET))
-
- data["Env"] = enva
-
- url = URL + 'containers/create'
- params = {
- "name": NAME
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- get_dagster_logger().info(f"URL: {str(url)}")
-
- req = request.Request(url, str.encode(json.dumps(data)))
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- c = r.read()
- d = json.loads(c)
- cid = d['Id']
-
- print(r.status)
- get_dagster_logger().info(f"Create: {str(r.status)}")
-
- # print(cid)
-
-    ## ------------ Archive: PUT the config archive (read from S3 below) into the container
-
- url = URL + 'containers/' + cid + '/archive'
- params = {
- 'path': ARCHIVE_PATH
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
-
- # DATA = read_file_bytestream(ARCHIVE_FILE)
- DATA = s3reader(ARCHIVE_FILE)
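-    # The archive endpoint mirrors Docker's PUT /containers/{id}/archive, which
-    # expects a tar stream, so the S3 object read above should already be a tar archive.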
-
- req = request.Request(url, data=DATA, method="PUT")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- print(r.status)
- get_dagster_logger().info(f"Archive: {str(r.status)}")
-
- # c = r.read()
- # print(c)
- # d = json.loads(c)
- # print(d)
-
- ## ------------ Start
-
- url = URL + 'containers/' + cid + '/start'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Start: {str(r.status)}")
-
- ## ------------ Wait expect 200
-
- url = URL + 'containers/' + cid + '/wait'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Wait: {str(r.status)}")
-
- ## ------------ Copy logs expect 200
-
- url = URL + 'containers/' + cid + '/logs'
- params = {
- 'stdout': 'true',
- 'stderr': 'false'
- }
- query_string = urllib.parse.urlencode(params)
-
- url = url + "?" + query_string
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- c = r.read()
-
- # write to file
- # f = open(LOGFILE, 'w')
- # f.write(str(c))
- # f.close()
-
- # write to s3
-    s3loader(c, NAME)  # r.read() already returns bytes, which is what s3loader needs
-
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"Logs: {str(r.status)}")
-
- ## ------------ Remove expect 204
-
- url = URL + 'containers/' + cid
- req = request.Request(url, method="DELETE")
- req.add_header('X-API-Key', APIKEY)
- # req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Remove: {str(r.status)}")
-
- return 0
-
-@op
-def obps_gleaner(context):
-    returned_value = gleanerio("gleaner", "obps")
-    r = 'returned value:{}'.format(returned_value)
-    get_dagster_logger().info(f"Gleaner notes are {r}")
-    return r
-
-@op
-def obps_nabu(context, msg: str):
-    returned_value = gleanerio("nabu", "obps")
-    r = 'returned value:{}'.format(returned_value)
-    return msg + r
-
-@op
-def obps_nabuprov(context, msg: str):
-    returned_value = gleanerio("prov", "obps")
-    r = 'returned value:{}'.format(returned_value)
-    return msg + r
-
-@op
-def obps_nabuorg(context, msg: str):
-    returned_value = gleanerio("orgs", "obps")
-    r = 'returned value:{}'.format(returned_value)
-    return msg + r
-
-@op
-def obps_naburelease(context, msg: str):
-    returned_value = gleanerio("release", "obps")
-    r = 'returned value:{}'.format(returned_value)
-    return msg + r
-
-@graph
-def harvest_obps():
-    harvest = obps_gleaner()
-    load1 = obps_nabu(harvest)
-    load2 = obps_nabuprov(load1)
-    load3 = obps_nabuorg(load2)
-    load4 = obps_naburelease(load3)
diff --git a/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_oceanexperts.py b/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_oceanexperts.py
deleted file mode 100644
index 576c1823..00000000
--- a/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_oceanexperts.py
+++ /dev/null
@@ -1,293 +0,0 @@
-from dagster import op, graph, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-
-# Vars and Envs
-
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-MINIO_URL = os.environ.get('GLEANER_MINIO_URL')
-MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT')
-MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL')
-MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET')
-MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY')
-MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET')
-
-
-def read_file_bytestream(image_path):
-    # Read the whole file into memory; the context manager closes the handle.
-    with open(image_path, 'rb') as f:
-        return f.read()
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object_name):
-    server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
-    client = Minio(
-        server,
-        secure=False,
-        access_key=os.environ.get('GLEANER_MINIO_KEY'),
-        secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
-    )
-    try:
-        data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object_name)
-        return data
-    except S3Error as err:
-        get_dagster_logger().info(f"S3 read error: {str(err)}")
-
-
-def s3loader(data, name):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- client = Minio(
- server,
- secure=False,
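-        # NOTE: secure is hard-coded to False here; GLEANER_MINIO_SSL is read above but not applied.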
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname
- client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'),
- objPrefix,
- io.BytesIO(data),
- len(data))
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-
-
-def gleanerio(mode, source):
- ## ------------ Create
-
- get_dagster_logger().info(f"Create: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH')
- CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"]
- NAME = "gleaner01_" + source
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "nabu"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
- return 1
-
- data = {}
- data["Image"] = IMAGE
- data["Cmd"] = CMD
-
-    # Environment variables passed to the container, e.g. "Env": ["FOO=bar", "BAZ=quux"]
-    enva = []
-    enva.append("MINIO_URL={}".format(MINIO_URL))
-    enva.append("MINIO_PORT={}".format(MINIO_PORT))
-    enva.append("MINIO_SSL={}".format(MINIO_SSL))
-    enva.append("MINIO_SECRET={}".format(MINIO_SECRET))
-    enva.append("MINIO_KEY={}".format(MINIO_KEY))
-    enva.append("MINIO_BUCKET={}".format(MINIO_BUCKET))
-
- data["Env"] = enva
-
- url = URL + 'containers/create'
- params = {
- "name": NAME
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- get_dagster_logger().info(f"URL: {str(url)}")
-
- req = request.Request(url, str.encode(json.dumps(data)))
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- c = r.read()
- d = json.loads(c)
- cid = d['Id']
-
- print(r.status)
- get_dagster_logger().info(f"Create: {str(r.status)}")
-
- # print(cid)
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
- url = URL + 'containers/' + cid + '/archive'
- params = {
- 'path': ARCHIVE_PATH
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
-
- # DATA = read_file_bytestream(ARCHIVE_FILE)
- DATA = s3reader(ARCHIVE_FILE)
-
- req = request.Request(url, data=DATA, method="PUT")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- print(r.status)
- get_dagster_logger().info(f"Archive: {str(r.status)}")
-
- # c = r.read()
- # print(c)
- # d = json.loads(c)
- # print(d)
-
- ## ------------ Start
-
- url = URL + 'containers/' + cid + '/start'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Start: {str(r.status)}")
-
- ## ------------ Wait expect 200
-
- url = URL + 'containers/' + cid + '/wait'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Wait: {str(r.status)}")
-
- ## ------------ Copy logs expect 200
-
- url = URL + 'containers/' + cid + '/logs'
- params = {
- 'stdout': 'true',
- 'stderr': 'false'
- }
- query_string = urllib.parse.urlencode(params)
-
- url = url + "?" + query_string
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- c = r.read()
-
- # write to file
- # f = open(LOGFILE, 'w')
- # f.write(str(c))
- # f.close()
-
- # write to s3
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
-
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"Logs: {str(r.status)}")
-
- ## ------------ Remove expect 204
-
- url = URL + 'containers/' + cid
- req = request.Request(url, method="DELETE")
- req.add_header('X-API-Key', APIKEY)
- # req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Remove: {str(r.status)}")
-
- return 0
-
-@op
-def oceanexperts_gleaner(context):
- returned_value = gleanerio(("gleaner"), "oceanexperts")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner notes are {r} ")
- return r
-
-@op
-def oceanexperts_nabu(context, msg: str):
- returned_value = gleanerio(("nabu"), "oceanexperts")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def oceanexperts_nabuprov(context, msg: str):
- returned_value = gleanerio(("prov"), "oceanexperts")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def oceanexperts_nabuorg(context, msg: str):
- returned_value = gleanerio(("orgs"), "oceanexperts")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def oceanexperts_naburelease(context, msg: str):
- returned_value = gleanerio(("release"), "oceanexperts")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@graph
-def harvest_oceanexperts():
- harvest = oceanexperts_gleaner()
- load1 = oceanexperts_nabu(harvest)
- load2 = oceanexperts_nabuprov(load1)
- load3 = oceanexperts_nabuorg(load2)
- load4 = oceanexperts_naburelease(load3)
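The five per-source ops and the graph above differ only in the literal source string, and the same pattern repeats verbatim in every generated file in this directory. A minimal sketch of a factory that could emit the identical chain, assuming gleanerio() is importable from a shared module (the import path and the name make_harvest_graph are illustrative, not part of the generated code):

from dagster import op, graph, get_dagster_logger

from common.gleanerio import gleanerio  # assumed shared module, not generated

def make_harvest_graph(source: str):
    # Chain: gleaner -> nabu (prune) -> prov -> orgs -> release, as above.

    @op(name=f"{source}_gleaner")
    def gleaner_op(context):
        r = f"returned value:{gleanerio('gleaner', source)}"
        get_dagster_logger().info(f"Gleaner notes are {r}")
        return r

    def nabu_op_for(mode: str):
        @op(name=f"{source}_nabu_{mode}")
        def nabu_op(context, msg: str):
            return msg + f"returned value:{gleanerio(mode, source)}"
        return nabu_op

    nabu, prov, orgs, release = (nabu_op_for(m) for m in ("nabu", "prov", "orgs", "release"))

    @graph(name=f"harvest_{source}")
    def harvest():
        release(orgs(prov(nabu(gleaner_op()))))

    return harvest

With a factory like this, each per-source module reduces to one call such as make_harvest_graph("oceanexperts"), and the op names in the Dagster UI stay identical to the generated ones.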
diff --git a/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_oceanscape.py b/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_oceanscape.py
deleted file mode 100644
index ded284fe..00000000
--- a/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_oceanscape.py
+++ /dev/null
@@ -1,293 +0,0 @@
-from dagster import op, graph, get_dagster_logger
-import subprocess
-import os, json, io
-import urllib
-from urllib import request
-from dagster import job, op, get_dagster_logger
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-
-# Vars and Envs
-
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-MINIO_URL = os.environ.get('GLEANER_MINIO_URL')
-MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT')
-MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL')
-MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET')
-MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY')
-MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET')
-
-
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- client = Minio(
- server,
- secure=False,
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
- try:
- data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- client = Minio(
- server,
- secure=False,
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname
- client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'),
- objPrefix,
- io.BytesIO(data),
- len(data))
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-
-
-def gleanerio(mode, source):
- ## ------------ Create
-
- get_dagster_logger().info(f"Create: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH')
- CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"]
- NAME = "gleaner01_" + source
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "nabu"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
- return 1
-
- data = {}
- data["Image"] = IMAGE
- data["Cmd"] = CMD
-
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
- enva = []
- enva.append(str("MINIO_URL={}".format(MINIO_URL)))
- enva.append(str("MINIO_PORT={}".format(MINIO_PORT)))
- enva.append(str("MINIO_SSL={}".format(MINIO_SSL)))
- enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET)))
- enva.append(str("MINIO_KEY={}".format(MINIO_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET)))
-
- data["Env"] = enva
-
- url = URL + 'containers/create'
- params = {
- "name": NAME
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- get_dagster_logger().info(f"URL: {str(url)}")
-
- req = request.Request(url, str.encode(json.dumps(data)))
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- c = r.read()
- d = json.loads(c)
- cid = d['Id']
-
- print(r.status)
- get_dagster_logger().info(f"Create: {str(r.status)}")
-
- # print(cid)
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
- url = URL + 'containers/' + cid + '/archive'
- params = {
- 'path': ARCHIVE_PATH
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
-
- # DATA = read_file_bytestream(ARCHIVE_FILE)
- DATA = s3reader(ARCHIVE_FILE)
-
- req = request.Request(url, data=DATA, method="PUT")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- print(r.status)
- get_dagster_logger().info(f"Archive: {str(r.status)}")
-
- # c = r.read()
- # print(c)
- # d = json.loads(c)
- # print(d)
-
- ## ------------ Start
-
- url = URL + 'containers/' + cid + '/start'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Start: {str(r.status)}")
-
- ## ------------ Wait expect 200
-
- url = URL + 'containers/' + cid + '/wait'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Wait: {str(r.status)}")
-
- ## ------------ Copy logs expect 200
-
- url = URL + 'containers/' + cid + '/logs'
- params = {
- 'stdout': 'true',
- 'stderr': 'false'
- }
- query_string = urllib.parse.urlencode(params)
-
- url = url + "?" + query_string
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- c = r.read()
-
- # write to file
- # f = open(LOGFILE, 'w')
- # f.write(str(c))
- # f.close()
-
- # write to s3
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
-
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"Logs: {str(r.status)}")
-
- ## ------------ Remove expect 204
-
- url = URL + 'containers/' + cid
- req = request.Request(url, method="DELETE")
- req.add_header('X-API-Key', APIKEY)
- # req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Remove: {str(r.status)}")
-
- return 0
-
-@op
-def oceanscape_gleaner(context):
- returned_value = gleanerio(("gleaner"), "oceanscape")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner notes are {r} ")
- return r
-
-@op
-def oceanscape_nabu(context, msg: str):
- returned_value = gleanerio(("nabu"), "oceanscape")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def oceanscape_nabuprov(context, msg: str):
- returned_value = gleanerio(("prov"), "oceanscape")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def oceanscape_nabuorg(context, msg: str):
- returned_value = gleanerio(("orgs"), "oceanscape")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def oceanscape_naburelease(context, msg: str):
- returned_value = gleanerio(("release"), "oceanscape")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@graph
-def harvest_oceanscape():
- harvest = oceanscape_gleaner()
- load1 = oceanscape_nabu(harvest)
- load2 = oceanscape_nabuprov(load1)
- load3 = oceanscape_nabuorg(load2)
- load4 = oceanscape_naburelease(load3)
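The create/start/wait/logs/remove sequence inside gleanerio() is plain Docker Engine API traffic proxied through Portainer. A hedged sketch of the same lifecycle written against the requests library (the generated code uses urllib only), omitting the config-archive upload step for brevity; PORTAINER_URL and PORTAINER_KEY are the same variables read above, and run_container is an illustrative name:

import os
import requests

def run_container(image: str, cmd: list, name: str) -> bytes:
    base = os.environ["PORTAINER_URL"]
    headers = {"X-API-Key": os.environ["PORTAINER_KEY"]}

    # Create the container and capture its id.
    r = requests.post(f"{base}containers/create", params={"name": name},
                      json={"Image": image, "Cmd": cmd}, headers=headers)
    r.raise_for_status()
    cid = r.json()["Id"]

    try:
        # Start, then block until the container exits (expect HTTP 200).
        requests.post(f"{base}containers/{cid}/start", headers=headers).raise_for_status()
        requests.post(f"{base}containers/{cid}/wait", headers=headers).raise_for_status()

        # Pull stdout so the log can be archived, as s3loader() does above.
        logs = requests.get(f"{base}containers/{cid}/logs",
                            params={"stdout": "true", "stderr": "false"},
                            headers=headers)
        logs.raise_for_status()
        return logs.content
    finally:
        # Remove the container even when a step fails (expect HTTP 204).
        requests.delete(f"{base}containers/{cid}", headers=headers)

The try/finally also addresses a gap in the generated flow: if start, wait, or the log fetch raises, the generated code leaves the container behind, which then blocks the next run's containers/create call with a name conflict.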
diff --git a/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_pdh.py b/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_pdh.py
deleted file mode 100644
index 4e754958..00000000
--- a/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_pdh.py
+++ /dev/null
@@ -1,293 +0,0 @@
-from dagster import op, graph, get_dagster_logger
-import subprocess
-import os, json, io
-import urllib
-from urllib import request
-from dagster import job, op, get_dagster_logger
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-
-# Vars and Envs
-
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-MINIO_URL = os.environ.get('GLEANER_MINIO_URL')
-MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT')
-MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL')
-MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET')
-MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY')
-MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET')
-
-
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- client = Minio(
- server,
- secure=False,
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
- try:
- data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- client = Minio(
- server,
- secure=False,
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname
- client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'),
- objPrefix,
- io.BytesIO(data),
- len(data))
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-
-
-def gleanerio(mode, source):
- ## ------------ Create
-
- get_dagster_logger().info(f"Create: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH')
- CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"]
- NAME = "gleaner01_" + source
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "nabu"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
- return 1
-
- data = {}
- data["Image"] = IMAGE
- data["Cmd"] = CMD
-
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
- enva = []
- enva.append(str("MINIO_URL={}".format(MINIO_URL)))
- enva.append(str("MINIO_PORT={}".format(MINIO_PORT)))
- enva.append(str("MINIO_SSL={}".format(MINIO_SSL)))
- enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET)))
- enva.append(str("MINIO_KEY={}".format(MINIO_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET)))
-
- data["Env"] = enva
-
- url = URL + 'containers/create'
- params = {
- "name": NAME
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- get_dagster_logger().info(f"URL: {str(url)}")
-
- req = request.Request(url, str.encode(json.dumps(data)))
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- c = r.read()
- d = json.loads(c)
- cid = d['Id']
-
- print(r.status)
- get_dagster_logger().info(f"Create: {str(r.status)}")
-
- # print(cid)
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
- url = URL + 'containers/' + cid + '/archive'
- params = {
- 'path': ARCHIVE_PATH
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
-
- # DATA = read_file_bytestream(ARCHIVE_FILE)
- DATA = s3reader(ARCHIVE_FILE)
-
- req = request.Request(url, data=DATA, method="PUT")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- print(r.status)
- get_dagster_logger().info(f"Archive: {str(r.status)}")
-
- # c = r.read()
- # print(c)
- # d = json.loads(c)
- # print(d)
-
- ## ------------ Start
-
- url = URL + 'containers/' + cid + '/start'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Start: {str(r.status)}")
-
- ## ------------ Wait expect 200
-
- url = URL + 'containers/' + cid + '/wait'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Wait: {str(r.status)}")
-
- ## ------------ Copy logs expect 200
-
- url = URL + 'containers/' + cid + '/logs'
- params = {
- 'stdout': 'true',
- 'stderr': 'false'
- }
- query_string = urllib.parse.urlencode(params)
-
- url = url + "?" + query_string
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- c = r.read()
-
- # write to file
- # f = open(LOGFILE, 'w')
- # f.write(str(c))
- # f.close()
-
- # write to s3
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
-
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"Logs: {str(r.status)}")
-
- ## ------------ Remove expect 204
-
- url = URL + 'containers/' + cid
- req = request.Request(url, method="DELETE")
- req.add_header('X-API-Key', APIKEY)
- # req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Remove: {str(r.status)}")
-
- return 0
-
-@op
-def pdh_gleaner(context):
- returned_value = gleanerio(("gleaner"), "pdh")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner notes are {r} ")
- return r
-
-@op
-def pdh_nabu(context, msg: str):
- returned_value = gleanerio(("nabu"), "pdh")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def pdh_nabuprov(context, msg: str):
- returned_value = gleanerio(("prov"), "pdh")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def pdh_nabuorg(context, msg: str):
- returned_value = gleanerio(("orgs"), "pdh")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def pdh_naburelease(context, msg: str):
- returned_value = gleanerio(("release"), "pdh")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@graph
-def harvest_pdh():
- harvest = pdh_gleaner()
- load1 = pdh_nabu(harvest)
- load2 = pdh_nabuprov(load1)
- load3 = pdh_nabuorg(load2)
- load4 = pdh_naburelease(load3)
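s3reader() and s3loader() each rebuild a Minio client with secure=False hardcoded, even though GLEANER_MINIO_SSL is read and passed into the container environment. A sketch of a single shared constructor that honors that flag; the helper names and the truthy-string handling are assumptions, not generated code:

import io
import os
from datetime import datetime

from minio import Minio

def minio_client() -> Minio:
    # Honor GLEANER_MINIO_SSL instead of hardcoding secure=False.
    secure = os.environ.get('GLEANER_MINIO_SSL', 'false').lower() in ('1', 'true', 'yes')
    return Minio(
        os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT'),
        secure=secure,
        access_key=os.environ.get('GLEANER_MINIO_KEY'),
        secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
    )

def upload_log(data: bytes, name: str) -> str:
    # Mirrors s3loader(): a timestamped object under GLEANERIO_LOG_PREFIX.
    stamp = datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
    obj = os.environ.get('GLEANERIO_LOG_PREFIX') + f"{name}_{stamp}.log"
    minio_client().put_object(os.environ.get('GLEANER_MINIO_BUCKET'), obj,
                              io.BytesIO(data), len(data))
    return obj

Callers should hand upload_log() the raw bytes from r.read(); the str(c).encode() call in the generated files stores the Python repr of the bytes (the literal b'...' form, with escaped newlines) rather than the log text itself.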
diff --git a/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_pogo.py b/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_pogo.py
deleted file mode 100644
index 4511e7f8..00000000
--- a/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_pogo.py
+++ /dev/null
@@ -1,293 +0,0 @@
-from dagster import op, graph, get_dagster_logger
-import subprocess
-import os, json, io
-import urllib
-from urllib import request
-from dagster import job, op, get_dagster_logger
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-
-# Vars and Envs
-
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-MINIO_URL = os.environ.get('GLEANER_MINIO_URL')
-MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT')
-MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL')
-MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET')
-MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY')
-MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET')
-
-
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- client = Minio(
- server,
- secure=False,
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
- try:
- data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT')
- client = Minio(
- server,
- secure=False,
- access_key=os.environ.get('GLEANER_MINIO_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET'),
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname
- client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'),
- objPrefix,
- io.BytesIO(data),
- len(data))
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-
-
-def gleanerio(mode, source):
- ## ------------ Create
-
- get_dagster_logger().info(f"Create: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH')
- CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"]
- NAME = "gleaner01_" + source
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "nabu"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source]
- NAME = "nabu01_" + source
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
- return 1
-
- data = {}
- data["Image"] = IMAGE
- data["Cmd"] = CMD
-
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
- enva = []
- enva.append(str("MINIO_URL={}".format(MINIO_URL)))
- enva.append(str("MINIO_PORT={}".format(MINIO_PORT)))
- enva.append(str("MINIO_SSL={}".format(MINIO_SSL)))
- enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET)))
- enva.append(str("MINIO_KEY={}".format(MINIO_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET)))
-
- data["Env"] = enva
-
- url = URL + 'containers/create'
- params = {
- "name": NAME
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- get_dagster_logger().info(f"URL: {str(url)}")
-
- req = request.Request(url, str.encode(json.dumps(data)))
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- c = r.read()
- d = json.loads(c)
- cid = d['Id']
-
- print(r.status)
- get_dagster_logger().info(f"Create: {str(r.status)}")
-
- # print(cid)
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
- url = URL + 'containers/' + cid + '/archive'
- params = {
- 'path': ARCHIVE_PATH
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
-
- # DATA = read_file_bytestream(ARCHIVE_FILE)
- DATA = s3reader(ARCHIVE_FILE)
-
- req = request.Request(url, data=DATA, method="PUT")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- print(r.status)
- get_dagster_logger().info(f"Archive: {str(r.status)}")
-
- # c = r.read()
- # print(c)
- # d = json.loads(c)
- # print(d)
-
- ## ------------ Start
-
- url = URL + 'containers/' + cid + '/start'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Start: {str(r.status)}")
-
- ## ------------ Wait expect 200
-
- url = URL + 'containers/' + cid + '/wait'
- req = request.Request(url, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Wait: {str(r.status)}")
-
- ## ------------ Copy logs expect 200
-
- url = URL + 'containers/' + cid + '/logs'
- params = {
- 'stdout': 'true',
- 'stderr': 'false'
- }
- query_string = urllib.parse.urlencode(params)
-
- url = url + "?" + query_string
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- c = r.read()
-
- # write to file
- # f = open(LOGFILE, 'w')
- # f.write(str(c))
- # f.close()
-
- # write to s3
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
-
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"Logs: {str(r.status)}")
-
- ## ------------ Remove expect 204
-
- url = URL + 'containers/' + cid
- req = request.Request(url, method="DELETE")
- req.add_header('X-API-Key', APIKEY)
- # req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Remove: {str(r.status)}")
-
- return 0
-
-@op
-def pogo_gleaner(context):
- returned_value = gleanerio(("gleaner"), "pogo")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner notes are {r} ")
- return r
-
-@op
-def pogo_nabu(context, msg: str):
- returned_value = gleanerio(("nabu"), "pogo")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def pogo_nabuprov(context, msg: str):
- returned_value = gleanerio(("prov"), "pogo")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def pogo_nabuorg(context, msg: str):
- returned_value = gleanerio(("orgs"), "pogo")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def pogo_naburelease(context, msg: str):
- returned_value = gleanerio(("release"), "pogo")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@graph
-def harvest_pogo():
- harvest = pogo_gleaner()
- load1 = pogo_nabu(harvest)
- load2 = pogo_nabuprov(load1)
- load3 = pogo_nabuorg(load2)
- load4 = pogo_naburelease(load3)
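The if/elif ladder on mode is the only part of gleanerio() that varies, and it too repeats identically in every generated file. A sketch of the same dispatch as a lookup table; the environment-variable names and CMD values are copied from the code above, while the MODES table and resolve() helper are illustrative:

import os

# mode -> (image env var, archive-object env var, archive-path env var, cmd builder)
MODES = {
    "gleaner": ("GLEANERIO_GLEANER_IMAGE", "GLEANERIO_GLEANER_ARCHIVE_OBJECT", "GLEANERIO_GLEANER_ARCHIVE_PATH",
                lambda s: ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", s, "--rude"]),
    "nabu":    ("GLEANERIO_NABU_IMAGE", "GLEANERIO_NABU_ARCHIVE_OBJECT", "GLEANERIO_NABU_ARCHIVE_PATH",
                lambda s: ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + s]),
    "prov":    ("GLEANERIO_NABU_IMAGE", "GLEANERIO_NABU_ARCHIVE_OBJECT", "GLEANERIO_NABU_ARCHIVE_PATH",
                lambda s: ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + s]),
    "orgs":    ("GLEANERIO_NABU_IMAGE", "GLEANERIO_NABU_ARCHIVE_OBJECT", "GLEANERIO_NABU_ARCHIVE_PATH",
                lambda s: ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"]),
    "release": ("GLEANERIO_NABU_IMAGE", "GLEANERIO_NABU_ARCHIVE_OBJECT", "GLEANERIO_NABU_ARCHIVE_PATH",
                lambda s: ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + s]),
}

def resolve(mode: str, source: str):
    if mode not in MODES:
        raise ValueError(f"unknown mode: {mode}")  # louder than the bare 'return 1'
    image_env, obj_env, path_env, cmd = MODES[mode]
    name = ("gleaner01_" if mode == "gleaner" else "nabu01_") + source
    return (os.environ.get(image_env), os.environ.get(obj_env),
            os.environ.get(path_env), cmd(source), name)

Raising on an unknown mode instead of returning 1 also means a misconfigured op fails visibly in Dagster rather than logging a "returned value:1" string and continuing.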
diff --git a/dagster/implnets/generatedCode/implnet-oih/output/repositories/repository.py b/dagster/implnets/generatedCode/implnet-oih/output/repositories/repository.py
deleted file mode 100644
index f7f74595..00000000
--- a/dagster/implnets/generatedCode/implnet-oih/output/repositories/repository.py
+++ /dev/null
@@ -1,371 +0,0 @@
-from dagster import repository
-from jobs.implnet_jobs_nwisgw20 import implnet_job_nwisgw20
-from sch.implnet_sch_nwisgw20 import implnet_sch_nwisgw20
-from jobs.implnet_jobs_nwisgw22 import implnet_job_nwisgw22
-from sch.implnet_sch_nwisgw22 import implnet_sch_nwisgw22
-from jobs.implnet_jobs_nwisgw16 import implnet_job_nwisgw16
-from sch.implnet_sch_nwisgw16 import implnet_sch_nwisgw16
-from jobs.implnet_jobs_nwisgw12 import implnet_job_nwisgw12
-from sch.implnet_sch_nwisgw12 import implnet_sch_nwisgw12
-from jobs.implnet_jobs_nwisgw25 import implnet_job_nwisgw25
-from sch.implnet_sch_nwisgw25 import implnet_sch_nwisgw25
-from jobs.implnet_jobs_nwisgw14 import implnet_job_nwisgw14
-from sch.implnet_sch_nwisgw14 import implnet_sch_nwisgw14
-from jobs.implnet_jobs_nwisgw23 import implnet_job_nwisgw23
-from sch.implnet_sch_nwisgw23 import implnet_sch_nwisgw23
-from jobs.implnet_jobs_nwisgw10 import implnet_job_nwisgw10
-from sch.implnet_sch_nwisgw10 import implnet_sch_nwisgw10
-from jobs.implnet_jobs_nwisgw15 import implnet_job_nwisgw15
-from sch.implnet_sch_nwisgw15 import implnet_sch_nwisgw15
-from jobs.implnet_jobs_nwisgw2 import implnet_job_nwisgw2
-from sch.implnet_sch_nwisgw2 import implnet_sch_nwisgw2
-from jobs.implnet_jobs_nwisgw24 import implnet_job_nwisgw24
-from sch.implnet_sch_nwisgw24 import implnet_sch_nwisgw24
-from jobs.implnet_jobs_nwisgw9 import implnet_job_nwisgw9
-from sch.implnet_sch_nwisgw9 import implnet_sch_nwisgw9
-from jobs.implnet_jobs_nwisgw19 import implnet_job_nwisgw19
-from sch.implnet_sch_nwisgw19 import implnet_sch_nwisgw19
-from jobs.implnet_jobs_nwisgw28 import implnet_job_nwisgw28
-from sch.implnet_sch_nwisgw28 import implnet_sch_nwisgw28
-from jobs.implnet_jobs_nwisgw26 import implnet_job_nwisgw26
-from sch.implnet_sch_nwisgw26 import implnet_sch_nwisgw26
-from jobs.implnet_jobs_nwisgw5 import implnet_job_nwisgw5
-from sch.implnet_sch_nwisgw5 import implnet_sch_nwisgw5
-from jobs.implnet_jobs_nwisgw13 import implnet_job_nwisgw13
-from sch.implnet_sch_nwisgw13 import implnet_sch_nwisgw13
-from jobs.implnet_jobs_nwisgw6 import implnet_job_nwisgw6
-from sch.implnet_sch_nwisgw6 import implnet_sch_nwisgw6
-from jobs.implnet_jobs_nwisgw3 import implnet_job_nwisgw3
-from sch.implnet_sch_nwisgw3 import implnet_sch_nwisgw3
-from jobs.implnet_jobs_nwisgw4 import implnet_job_nwisgw4
-from sch.implnet_sch_nwisgw4 import implnet_sch_nwisgw4
-from jobs.implnet_jobs_nwisgw1 import implnet_job_nwisgw1
-from sch.implnet_sch_nwisgw1 import implnet_sch_nwisgw1
-from jobs.implnet_jobs_nwisgw21 import implnet_job_nwisgw21
-from sch.implnet_sch_nwisgw21 import implnet_sch_nwisgw21
-from jobs.implnet_jobs_nwisgw27 import implnet_job_nwisgw27
-from sch.implnet_sch_nwisgw27 import implnet_sch_nwisgw27
-from jobs.implnet_jobs_nwisgw8 import implnet_job_nwisgw8
-from sch.implnet_sch_nwisgw8 import implnet_sch_nwisgw8
-from jobs.implnet_jobs_nwisgw17 import implnet_job_nwisgw17
-from sch.implnet_sch_nwisgw17 import implnet_sch_nwisgw17
-from jobs.implnet_jobs_nwisgw18 import implnet_job_nwisgw18
-from sch.implnet_sch_nwisgw18 import implnet_sch_nwisgw18
-from jobs.implnet_jobs_nwisgw7 import implnet_job_nwisgw7
-from sch.implnet_sch_nwisgw7 import implnet_sch_nwisgw7
-from jobs.implnet_jobs_nwisgw11 import implnet_job_nwisgw11
-from sch.implnet_sch_nwisgw11 import implnet_sch_nwisgw11
-from jobs.implnet_jobs_nwisgw0 import implnet_job_nwisgw0
-from sch.implnet_sch_nwisgw0 import implnet_sch_nwisgw0
-from jobs.implnet_jobs_nwissite1 import implnet_job_nwissite1
-from sch.implnet_sch_nwissite1 import implnet_sch_nwissite1
-from jobs.implnet_jobs_nwissite3 import implnet_job_nwissite3
-from sch.implnet_sch_nwissite3 import implnet_sch_nwissite3
-from jobs.implnet_jobs_nwissite0 import implnet_job_nwissite0
-from sch.implnet_sch_nwissite0 import implnet_sch_nwissite0
-from jobs.implnet_jobs_nwissite2 import implnet_job_nwissite2
-from sch.implnet_sch_nwissite2 import implnet_sch_nwissite2
-from jobs.implnet_jobs_gfv11pois1 import implnet_job_gfv11pois1
-from sch.implnet_sch_gfv11pois1 import implnet_sch_gfv11pois1
-from jobs.implnet_jobs_gfv11pois0 import implnet_job_gfv11pois0
-from sch.implnet_sch_gfv11pois0 import implnet_sch_gfv11pois0
-from jobs.implnet_jobs_hydrologicunit0 import implnet_job_hydrologicunit0
-from sch.implnet_sch_hydrologicunit0 import implnet_sch_hydrologicunit0
-from jobs.implnet_jobs_damspids0 import implnet_job_damspids0
-from sch.implnet_sch_damspids0 import implnet_sch_damspids0
-from jobs.implnet_jobs_cuahsihishydrodataczhrids0 import implnet_job_cuahsihishydrodataczhrids0
-from sch.implnet_sch_cuahsihishydrodataczhrids0 import implnet_sch_cuahsihishydrodataczhrids0
-from jobs.implnet_jobs_cuahsihisnooksackmicroclimatenetworkids0 import implnet_job_cuahsihisnooksackmicroclimatenetworkids0
-from sch.implnet_sch_cuahsihisnooksackmicroclimatenetworkids0 import implnet_sch_cuahsihisnooksackmicroclimatenetworkids0
-from jobs.implnet_jobs_cuahsihisneonids0 import implnet_job_cuahsihisneonids0
-from sch.implnet_sch_cuahsihisneonids0 import implnet_sch_cuahsihisneonids0
-from jobs.implnet_jobs_cuahsihisglobalriversobservatoryids0 import implnet_job_cuahsihisglobalriversobservatoryids0
-from sch.implnet_sch_cuahsihisglobalriversobservatoryids0 import implnet_sch_cuahsihisglobalriversobservatoryids0
-from jobs.implnet_jobs_cuahsihistncwaterdataids0 import implnet_job_cuahsihistncwaterdataids0
-from sch.implnet_sch_cuahsihistncwaterdataids0 import implnet_sch_cuahsihistncwaterdataids0
-from jobs.implnet_jobs_cuahsihisscotlandnwisids0 import implnet_job_cuahsihisscotlandnwisids0
-from sch.implnet_sch_cuahsihisscotlandnwisids0 import implnet_sch_cuahsihisscotlandnwisids0
-from jobs.implnet_jobs_cuahsihisczoboulderids0 import implnet_job_cuahsihisczoboulderids0
-from sch.implnet_sch_cuahsihisczoboulderids0 import implnet_sch_cuahsihisczoboulderids0
-from jobs.implnet_jobs_cuahsihisyosemitehydroclimatenetworkids0 import implnet_job_cuahsihisyosemitehydroclimatenetworkids0
-from sch.implnet_sch_cuahsihisyosemitehydroclimatenetworkids0 import implnet_sch_cuahsihisyosemitehydroclimatenetworkids0
-from jobs.implnet_jobs_cuahsihismuddyriverids0 import implnet_job_cuahsihismuddyriverids0
-from sch.implnet_sch_cuahsihismuddyriverids0 import implnet_sch_cuahsihismuddyriverids0
-from jobs.implnet_jobs_cuahsihisczomercedids0 import implnet_job_cuahsihisczomercedids0
-from sch.implnet_sch_cuahsihisczomercedids0 import implnet_sch_cuahsihisczomercedids0
-from jobs.implnet_jobs_cuahsihisghcnids0 import implnet_job_cuahsihisghcnids0
-from sch.implnet_sch_cuahsihisghcnids0 import implnet_sch_cuahsihisghcnids0
-from jobs.implnet_jobs_cuahsihismmaatacamaids0 import implnet_job_cuahsihismmaatacamaids0
-from sch.implnet_sch_cuahsihismmaatacamaids0 import implnet_sch_cuahsihismmaatacamaids0
-from jobs.implnet_jobs_cuahsihisumbcwqids0 import implnet_job_cuahsihisumbcwqids0
-from sch.implnet_sch_cuahsihisumbcwqids0 import implnet_sch_cuahsihisumbcwqids0
-from jobs.implnet_jobs_cuahsihisgleonlakeannieids0 import implnet_job_cuahsihisgleonlakeannieids0
-from sch.implnet_sch_cuahsihisgleonlakeannieids0 import implnet_sch_cuahsihisgleonlakeannieids0
-from jobs.implnet_jobs_cuahsihisluwlids0 import implnet_job_cuahsihisluwlids0
-from sch.implnet_sch_cuahsihisluwlids0 import implnet_sch_cuahsihisluwlids0
-from jobs.implnet_jobs_cuahsihiscedarriverids0 import implnet_job_cuahsihiscedarriverids0
-from sch.implnet_sch_cuahsihiscedarriverids0 import implnet_sch_cuahsihiscedarriverids0
-from jobs.implnet_jobs_cuahsihisccbepdapids0 import implnet_job_cuahsihisccbepdapids0
-from sch.implnet_sch_cuahsihisccbepdapids0 import implnet_sch_cuahsihisccbepdapids0
-from jobs.implnet_jobs_cuahsihiskansasweatherdataids0 import implnet_job_cuahsihiskansasweatherdataids0
-from sch.implnet_sch_cuahsihiskansasweatherdataids0 import implnet_sch_cuahsihiskansasweatherdataids0
-from jobs.implnet_jobs_cuahsihisodmkentstateids0 import implnet_job_cuahsihisodmkentstateids0
-from sch.implnet_sch_cuahsihisodmkentstateids0 import implnet_sch_cuahsihisodmkentstateids0
-from jobs.implnet_jobs_cuahsihisgleondorsetids0 import implnet_job_cuahsihisgleondorsetids0
-from sch.implnet_sch_cuahsihisgleondorsetids0 import implnet_sch_cuahsihisgleondorsetids0
-from jobs.implnet_jobs_cuahsihisclarksburgspids0 import implnet_job_cuahsihisclarksburgspids0
-from sch.implnet_sch_cuahsihisclarksburgspids0 import implnet_sch_cuahsihisclarksburgspids0
-from jobs.implnet_jobs_cuahsihiscrwaids0 import implnet_job_cuahsihiscrwaids0
-from sch.implnet_sch_cuahsihiscrwaids0 import implnet_sch_cuahsihiscrwaids0
-from jobs.implnet_jobs_cuahsihiscuisoids0 import implnet_job_cuahsihiscuisoids0
-from sch.implnet_sch_cuahsihiscuisoids0 import implnet_sch_cuahsihiscuisoids0
-from jobs.implnet_jobs_cuahsihisprovorivergamutids0 import implnet_job_cuahsihisprovorivergamutids0
-from sch.implnet_sch_cuahsihisprovorivergamutids0 import implnet_sch_cuahsihisprovorivergamutids0
-from jobs.implnet_jobs_cuahsihisirwaids0 import implnet_job_cuahsihisirwaids0
-from sch.implnet_sch_cuahsihisirwaids0 import implnet_sch_cuahsihisirwaids0
-from jobs.implnet_jobs_cuahsihisczoluquilloids0 import implnet_job_cuahsihisczoluquilloids0
-from sch.implnet_sch_cuahsihisczoluquilloids0 import implnet_sch_cuahsihisczoluquilloids0
-from jobs.implnet_jobs_cuahsihistuolumnemdwids0 import implnet_job_cuahsihistuolumnemdwids0
-from sch.implnet_sch_cuahsihistuolumnemdwids0 import implnet_sch_cuahsihistuolumnemdwids0
-from jobs.implnet_jobs_cuahsihisrmblids0 import implnet_job_cuahsihisrmblids0
-from sch.implnet_sch_cuahsihisrmblids0 import implnet_sch_cuahsihisrmblids0
-from jobs.implnet_jobs_cuahsihispanolaodmids0 import implnet_job_cuahsihispanolaodmids0
-from sch.implnet_sch_cuahsihispanolaodmids0 import implnet_sch_cuahsihispanolaodmids0
-from jobs.implnet_jobs_cuahsihisnewnids0 import implnet_job_cuahsihisnewnids0
-from sch.implnet_sch_cuahsihisnewnids0 import implnet_sch_cuahsihisnewnids0
-from jobs.implnet_jobs_cuahsihisczoudelids0 import implnet_job_cuahsihisczoudelids0
-from sch.implnet_sch_cuahsihisczoudelids0 import implnet_sch_cuahsihisczoudelids0
-from jobs.implnet_jobs_cuahsihisfarmrwaids0 import implnet_job_cuahsihisfarmrwaids0
-from sch.implnet_sch_cuahsihisfarmrwaids0 import implnet_sch_cuahsihisfarmrwaids0
-from jobs.implnet_jobs_cuahsihisskcmilltownids0 import implnet_job_cuahsihisskcmilltownids0
-from sch.implnet_sch_cuahsihisskcmilltownids0 import implnet_sch_cuahsihisskcmilltownids0
-from jobs.implnet_jobs_cuahsihisumbcgwids0 import implnet_job_cuahsihisumbcgwids0
-from sch.implnet_sch_cuahsihisumbcgwids0 import implnet_sch_cuahsihisumbcgwids0
-from jobs.implnet_jobs_cuahsihisshalenetworkodmids0 import implnet_job_cuahsihisshalenetworkodmids0
-from sch.implnet_sch_cuahsihisshalenetworkodmids0 import implnet_sch_cuahsihisshalenetworkodmids0
-from jobs.implnet_jobs_cuahsihisnevadosids0 import implnet_job_cuahsihisnevadosids0
-from sch.implnet_sch_cuahsihisnevadosids0 import implnet_sch_cuahsihisnevadosids0
-from jobs.implnet_jobs_cuahsihisweiherbachids0 import implnet_job_cuahsihisweiherbachids0
-from sch.implnet_sch_cuahsihisweiherbachids0 import implnet_sch_cuahsihisweiherbachids0
-from jobs.implnet_jobs_cuahsihismazarriverprojectids0 import implnet_job_cuahsihismazarriverprojectids0
-from sch.implnet_sch_cuahsihismazarriverprojectids0 import implnet_sch_cuahsihismazarriverprojectids0
-from jobs.implnet_jobs_cuahsihisgleonsunapeeids0 import implnet_job_cuahsihisgleonsunapeeids0
-from sch.implnet_sch_cuahsihisgleonsunapeeids0 import implnet_sch_cuahsihisgleonsunapeeids0
-from jobs.implnet_jobs_cuahsihisorsancohabids0 import implnet_job_cuahsihisorsancohabids0
-from sch.implnet_sch_cuahsihisorsancohabids0 import implnet_sch_cuahsihisorsancohabids0
-from jobs.implnet_jobs_cuahsihismwraids0 import implnet_job_cuahsihismwraids0
-from sch.implnet_sch_cuahsihismwraids0 import implnet_sch_cuahsihismwraids0
-from jobs.implnet_jobs_cuahsihismaaeriids0 import implnet_job_cuahsihismaaeriids0
-from sch.implnet_sch_cuahsihismaaeriids0 import implnet_sch_cuahsihismaaeriids0
-from jobs.implnet_jobs_cuahsihisnceiww2ids0 import implnet_job_cuahsihisnceiww2ids0
-from sch.implnet_sch_cuahsihisnceiww2ids0 import implnet_sch_cuahsihisnceiww2ids0
-from jobs.implnet_jobs_cuahsihistarlandwaterqualityids0 import implnet_job_cuahsihistarlandwaterqualityids0
-from sch.implnet_sch_cuahsihistarlandwaterqualityids0 import implnet_sch_cuahsihistarlandwaterqualityids0
-from jobs.implnet_jobs_cuahsihislczoodm2ids0 import implnet_job_cuahsihislczoodm2ids0
-from sch.implnet_sch_cuahsihislczoodm2ids0 import implnet_sch_cuahsihislczoodm2ids0
-from jobs.implnet_jobs_cuahsihiscocorahsids0 import implnet_job_cuahsihiscocorahsids0
-from sch.implnet_sch_cuahsihiscocorahsids0 import implnet_sch_cuahsihiscocorahsids0
-from jobs.implnet_jobs_cuahsihisparalanaturalezaids0 import implnet_job_cuahsihisparalanaturalezaids0
-from sch.implnet_sch_cuahsihisparalanaturalezaids0 import implnet_sch_cuahsihisparalanaturalezaids0
-from jobs.implnet_jobs_cuahsihisczocatalinaids0 import implnet_job_cuahsihisczocatalinaids0
-from sch.implnet_sch_cuahsihisczocatalinaids0 import implnet_sch_cuahsihisczocatalinaids0
-from jobs.implnet_jobs_cuahsihisieeratwilkesuniversityids0 import implnet_job_cuahsihisieeratwilkesuniversityids0
-from sch.implnet_sch_cuahsihisieeratwilkesuniversityids0 import implnet_sch_cuahsihisieeratwilkesuniversityids0
-from jobs.implnet_jobs_cuahsihismudlakeids0 import implnet_job_cuahsihismudlakeids0
-from sch.implnet_sch_cuahsihismudlakeids0 import implnet_sch_cuahsihismudlakeids0
-from jobs.implnet_jobs_cuahsihismwdisids0 import implnet_job_cuahsihismwdisids0
-from sch.implnet_sch_cuahsihismwdisids0 import implnet_sch_cuahsihismwdisids0
-from jobs.implnet_jobs_cuahsihisloganriverids0 import implnet_job_cuahsihisloganriverids0
-from sch.implnet_sch_cuahsihisloganriverids0 import implnet_sch_cuahsihisloganriverids0
-from jobs.implnet_jobs_cuahsihisscanids0 import implnet_job_cuahsihisscanids0
-from sch.implnet_sch_cuahsihisscanids0 import implnet_sch_cuahsihisscanids0
-from jobs.implnet_jobs_cuahsihisnashrwaids0 import implnet_job_cuahsihisnashrwaids0
-from sch.implnet_sch_cuahsihisnashrwaids0 import implnet_sch_cuahsihisnashrwaids0
-from jobs.implnet_jobs_cuahsihismobilecrowdhydrologyids0 import implnet_job_cuahsihismobilecrowdhydrologyids0
-from sch.implnet_sch_cuahsihismobilecrowdhydrologyids0 import implnet_sch_cuahsihismobilecrowdhydrologyids0
-from jobs.implnet_jobs_cuahsihisandrewsforestlterids0 import implnet_job_cuahsihisandrewsforestlterids0
-from sch.implnet_sch_cuahsihisandrewsforestlterids0 import implnet_sch_cuahsihisandrewsforestlterids0
-from jobs.implnet_jobs_cuahsihisloganrivergamutids0 import implnet_job_cuahsihisloganrivergamutids0
-from sch.implnet_sch_cuahsihisloganrivergamutids0 import implnet_sch_cuahsihisloganrivergamutids0
-from jobs.implnet_jobs_cuahsihislittlebearriverids0 import implnet_job_cuahsihislittlebearriverids0
-from sch.implnet_sch_cuahsihislittlebearriverids0 import implnet_sch_cuahsihislittlebearriverids0
-from jobs.implnet_jobs_cuahsihislterntlwoodruffids0 import implnet_job_cuahsihislterntlwoodruffids0
-from sch.implnet_sch_cuahsihislterntlwoodruffids0 import implnet_sch_cuahsihislterntlwoodruffids0
-from jobs.implnet_jobs_cuahsihissagehencreekids0 import implnet_job_cuahsihissagehencreekids0
-from sch.implnet_sch_cuahsihissagehencreekids0 import implnet_sch_cuahsihissagehencreekids0
-from jobs.implnet_jobs_cuahsihisshalenetworkodmids1 import implnet_job_cuahsihisshalenetworkodmids1
-from sch.implnet_sch_cuahsihisshalenetworkodmids1 import implnet_sch_cuahsihisshalenetworkodmids1
-from jobs.implnet_jobs_cuahsihisfrcwqmids0 import implnet_job_cuahsihisfrcwqmids0
-from sch.implnet_sch_cuahsihisfrcwqmids0 import implnet_sch_cuahsihisfrcwqmids0
-from jobs.implnet_jobs_cuahsihishydrodataczdids0 import implnet_job_cuahsihishydrodataczdids0
-from sch.implnet_sch_cuahsihishydrodataczdids0 import implnet_sch_cuahsihishydrodataczdids0
-from jobs.implnet_jobs_cuahsihisdrwiids0 import implnet_job_cuahsihisdrwiids0
-from sch.implnet_sch_cuahsihisdrwiids0 import implnet_sch_cuahsihisdrwiids0
-from jobs.implnet_jobs_cuahsihisubwpadids0 import implnet_job_cuahsihisubwpadids0
-from sch.implnet_sch_cuahsihisubwpadids0 import implnet_sch_cuahsihisubwpadids0
-from jobs.implnet_jobs_cuahsihistrwaids0 import implnet_job_cuahsihistrwaids0
-from sch.implnet_sch_cuahsihistrwaids0 import implnet_sch_cuahsihistrwaids0
-from jobs.implnet_jobs_cuahsihisredbuttecreekgamutids0 import implnet_job_cuahsihisredbuttecreekgamutids0
-from sch.implnet_sch_cuahsihisredbuttecreekgamutids0 import implnet_sch_cuahsihisredbuttecreekgamutids0
-from jobs.implnet_jobs_cuahsihisglacialridgeids0 import implnet_job_cuahsihisglacialridgeids0
-from sch.implnet_sch_cuahsihisglacialridgeids0 import implnet_sch_cuahsihisglacialridgeids0
-from jobs.implnet_jobs_cuahsihisfcelterids0 import implnet_job_cuahsihisfcelterids0
-from sch.implnet_sch_cuahsihisfcelterids0 import implnet_sch_cuahsihisfcelterids0
-from jobs.implnet_jobs_cuahsihisczoarizids0 import implnet_job_cuahsihisczoarizids0
-from sch.implnet_sch_cuahsihisczoarizids0 import implnet_sch_cuahsihisczoarizids0
-from jobs.implnet_jobs_cuahsihiscalvinhhsids0 import implnet_job_cuahsihiscalvinhhsids0
-from sch.implnet_sch_cuahsihiscalvinhhsids0 import implnet_sch_cuahsihiscalvinhhsids0
-from jobs.implnet_jobs_cuahsihissnotelids0 import implnet_job_cuahsihissnotelids0
-from sch.implnet_sch_cuahsihissnotelids0 import implnet_sch_cuahsihissnotelids0
-from jobs.implnet_jobs_cuahsihisnevcanids0 import implnet_job_cuahsihisnevcanids0
-from sch.implnet_sch_cuahsihisnevcanids0 import implnet_sch_cuahsihisnevcanids0
-from jobs.implnet_jobs_cuahsihisczopsuids0 import implnet_job_cuahsihisczopsuids0
-from sch.implnet_sch_cuahsihisczopsuids0 import implnet_sch_cuahsihisczopsuids0
-from jobs.implnet_jobs_cuahsihisbrazilucbids0 import implnet_job_cuahsihisbrazilucbids0
-from sch.implnet_sch_cuahsihisbrazilucbids0 import implnet_sch_cuahsihisbrazilucbids0
-from jobs.implnet_jobs_cuahsihisgleonauburnids0 import implnet_job_cuahsihisgleonauburnids0
-from sch.implnet_sch_cuahsihisgleonauburnids0 import implnet_sch_cuahsihisgleonauburnids0
-from jobs.implnet_jobs_cuahsihislaselvastreamdischargeids0 import implnet_job_cuahsihislaselvastreamdischargeids0
-from sch.implnet_sch_cuahsihislaselvastreamdischargeids0 import implnet_sch_cuahsihislaselvastreamdischargeids0
-from jobs.implnet_jobs_cuahsihisisbenaids0 import implnet_job_cuahsihisisbenaids0
-from sch.implnet_sch_cuahsihisisbenaids0 import implnet_sch_cuahsihisisbenaids0
-from jobs.implnet_jobs_cuahsihisswedishmonitoringdataids0 import implnet_job_cuahsihisswedishmonitoringdataids0
-from sch.implnet_sch_cuahsihisswedishmonitoringdataids0 import implnet_sch_cuahsihisswedishmonitoringdataids0
-from jobs.implnet_jobs_cuahsihisunhsnowids0 import implnet_job_cuahsihisunhsnowids0
-from sch.implnet_sch_cuahsihisunhsnowids0 import implnet_sch_cuahsihisunhsnowids0
-from jobs.implnet_jobs_cuahsihishassbergeids0 import implnet_job_cuahsihishassbergeids0
-from sch.implnet_sch_cuahsihishassbergeids0 import implnet_sch_cuahsihishassbergeids0
-from jobs.implnet_jobs_cuahsihisnhgswofids0 import implnet_job_cuahsihisnhgswofids0
-from sch.implnet_sch_cuahsihisnhgswofids0 import implnet_sch_cuahsihisnhgswofids0
-from jobs.implnet_jobs_cuahsihisgonggaids0 import implnet_job_cuahsihisgonggaids0
-from sch.implnet_sch_cuahsihisgonggaids0 import implnet_sch_cuahsihisgonggaids0
-from jobs.implnet_jobs_cuahsihismopexids0 import implnet_job_cuahsihismopexids0
-from sch.implnet_sch_cuahsihismopexids0 import implnet_sch_cuahsihismopexids0
-from jobs.implnet_jobs_cagagespids0 import implnet_job_cagagespids0
-from sch.implnet_sch_cagagespids0 import implnet_sch_cagagespids0
-from jobs.implnet_jobs_sechydrgreg0 import implnet_job_sechydrgreg0
-from sch.implnet_sch_sechydrgreg0 import implnet_sch_sechydrgreg0
-from jobs.implnet_jobs_counties0 import implnet_job_counties0
-from sch.implnet_sch_counties0 import implnet_sch_counties0
-from jobs.implnet_jobs_pws0 import implnet_job_pws0
-from sch.implnet_sch_pws0 import implnet_sch_pws0
-from jobs.implnet_jobs_hu060 import implnet_job_hu060
-from sch.implnet_sch_hu060 import implnet_sch_hu060
-from jobs.implnet_jobs_nataq0 import implnet_job_nataq0
-from sch.implnet_sch_nataq0 import implnet_sch_nataq0
-from jobs.implnet_jobs_cbsa0 import implnet_job_cbsa0
-from sch.implnet_sch_cbsa0 import implnet_sch_cbsa0
-from jobs.implnet_jobs_hu080 import implnet_job_hu080
-from sch.implnet_sch_hu080 import implnet_sch_hu080
-from jobs.implnet_jobs_hu040 import implnet_job_hu040
-from sch.implnet_sch_hu040 import implnet_sch_hu040
-from jobs.implnet_jobs_princiaq0 import implnet_job_princiaq0
-from sch.implnet_sch_princiaq0 import implnet_sch_princiaq0
-from jobs.implnet_jobs_refgage0 import implnet_job_refgage0
-from sch.implnet_sch_refgage0 import implnet_sch_refgage0
-from jobs.implnet_jobs_refgage3 import implnet_job_refgage3
-from sch.implnet_sch_refgage3 import implnet_sch_refgage3
-from jobs.implnet_jobs_refgage2 import implnet_job_refgage2
-from sch.implnet_sch_refgage2 import implnet_sch_refgage2
-from jobs.implnet_jobs_refgage1 import implnet_job_refgage1
-from sch.implnet_sch_refgage1 import implnet_sch_refgage1
-from jobs.implnet_jobs_dams0 import implnet_job_dams0
-from sch.implnet_sch_dams0 import implnet_sch_dams0
-from jobs.implnet_jobs_dams1 import implnet_job_dams1
-from sch.implnet_sch_dams1 import implnet_sch_dams1
-from jobs.implnet_jobs_ua100 import implnet_job_ua100
-from sch.implnet_sch_ua100 import implnet_sch_ua100
-from jobs.implnet_jobs_states0 import implnet_job_states0
-from sch.implnet_sch_states0 import implnet_sch_states0
-from jobs.implnet_jobs_hu100 import implnet_job_hu100
-from sch.implnet_sch_hu100 import implnet_sch_hu100
-from jobs.implnet_jobs_aiannh0 import implnet_job_aiannh0
-from sch.implnet_sch_aiannh0 import implnet_sch_aiannh0
-from jobs.implnet_jobs_hu020 import implnet_job_hu020
-from sch.implnet_sch_hu020 import implnet_sch_hu020
-from jobs.implnet_jobs_mainstems0 import implnet_job_mainstems0
-from sch.implnet_sch_mainstems0 import implnet_sch_mainstems0
-from jobs.implnet_jobs_places0 import implnet_job_places0
-from sch.implnet_sch_places0 import implnet_sch_places0
-from jobs.implnet_jobs_hmw0 import implnet_job_hmw0
-from sch.implnet_sch_hmw0 import implnet_sch_hmw0
-from jobs.implnet_jobs_hmw1 import implnet_job_hmw1
-from sch.implnet_sch_hmw1 import implnet_sch_hmw1
-from jobs.implnet_jobs_huc12pp0 import implnet_job_huc12pp0
-from sch.implnet_sch_huc12pp0 import implnet_sch_huc12pp0
-from jobs.implnet_jobs_huc12pp1 import implnet_job_huc12pp1
-from sch.implnet_sch_huc12pp1 import implnet_sch_huc12pp1
-from jobs.implnet_jobs_nmwdiose3 import implnet_job_nmwdiose3
-from sch.implnet_sch_nmwdiose3 import implnet_sch_nmwdiose3
-from jobs.implnet_jobs_nmwdiose2 import implnet_job_nmwdiose2
-from sch.implnet_sch_nmwdiose2 import implnet_sch_nmwdiose2
-from jobs.implnet_jobs_nmwdiose0 import implnet_job_nmwdiose0
-from sch.implnet_sch_nmwdiose0 import implnet_sch_nmwdiose0
-from jobs.implnet_jobs_nmwdiose4 import implnet_job_nmwdiose4
-from sch.implnet_sch_nmwdiose4 import implnet_sch_nmwdiose4
-from jobs.implnet_jobs_nmwdiose1 import implnet_job_nmwdiose1
-from sch.implnet_sch_nmwdiose1 import implnet_sch_nmwdiose1
-from jobs.implnet_jobs_nmwdist0 import implnet_job_nmwdist0
-from sch.implnet_sch_nmwdist0 import implnet_sch_nmwdist0
-from jobs.implnet_jobs_selfieids0 import implnet_job_selfieids0
-from sch.implnet_sch_selfieids0 import implnet_sch_selfieids0
-from jobs.implnet_jobs_chyldpilotids0 import implnet_job_chyldpilotids0
-from sch.implnet_sch_chyldpilotids0 import implnet_sch_chyldpilotids0
-from jobs.implnet_jobs_rise0 import implnet_job_rise0
-from sch.implnet_sch_rise0 import implnet_sch_rise0
-from jobs.implnet_jobs_autotest10 import implnet_job_autotest10
-from sch.implnet_sch_autotest10 import implnet_sch_autotest10
-from jobs.implnet_jobs_links0 import implnet_job_links0
-from sch.implnet_sch_links0 import implnet_sch_links0
-from jobs.implnet_jobs_demo0 import implnet_job_demo0
-from sch.implnet_sch_demo0 import implnet_sch_demo0
-from jobs.implnet_jobs_autotest20 import implnet_job_autotest20
-from sch.implnet_sch_autotest20 import implnet_sch_autotest20
-from jobs.implnet_jobs_wade2 import implnet_job_wade2
-from sch.implnet_sch_wade2 import implnet_sch_wade2
-from jobs.implnet_jobs_wade0 import implnet_job_wade0
-from sch.implnet_sch_wade0 import implnet_sch_wade0
-from jobs.implnet_jobs_wade17 import implnet_job_wade17
-from sch.implnet_sch_wade17 import implnet_sch_wade17
-from jobs.implnet_jobs_wade9 import implnet_job_wade9
-from sch.implnet_sch_wade9 import implnet_sch_wade9
-from jobs.implnet_jobs_wade7 import implnet_job_wade7
-from sch.implnet_sch_wade7 import implnet_sch_wade7
-from jobs.implnet_jobs_wade3 import implnet_job_wade3
-from sch.implnet_sch_wade3 import implnet_sch_wade3
-from jobs.implnet_jobs_wade15 import implnet_job_wade15
-from sch.implnet_sch_wade15 import implnet_sch_wade15
-from jobs.implnet_jobs_wade5 import implnet_job_wade5
-from sch.implnet_sch_wade5 import implnet_sch_wade5
-from jobs.implnet_jobs_wade10 import implnet_job_wade10
-from sch.implnet_sch_wade10 import implnet_sch_wade10
-from jobs.implnet_jobs_wade14 import implnet_job_wade14
-from sch.implnet_sch_wade14 import implnet_sch_wade14
-from jobs.implnet_jobs_wade18 import implnet_job_wade18
-from sch.implnet_sch_wade18 import implnet_sch_wade18
-from jobs.implnet_jobs_wade13 import implnet_job_wade13
-from sch.implnet_sch_wade13 import implnet_sch_wade13
-from jobs.implnet_jobs_wade8 import implnet_job_wade8
-from sch.implnet_sch_wade8 import implnet_sch_wade8
-from jobs.implnet_jobs_wade19 import implnet_job_wade19
-from sch.implnet_sch_wade19 import implnet_sch_wade19
-from jobs.implnet_jobs_wade12 import implnet_job_wade12
-from sch.implnet_sch_wade12 import implnet_sch_wade12
-from jobs.implnet_jobs_wade4 import implnet_job_wade4
-from sch.implnet_sch_wade4 import implnet_sch_wade4
-from jobs.implnet_jobs_wade16 import implnet_job_wade16
-from sch.implnet_sch_wade16 import implnet_sch_wade16
-from jobs.implnet_jobs_wade1 import implnet_job_wade1
-from sch.implnet_sch_wade1 import implnet_sch_wade1
-from jobs.implnet_jobs_wade6 import implnet_job_wade6
-from sch.implnet_sch_wade6 import implnet_sch_wade6
-from jobs.implnet_jobs_wade11 import implnet_job_wade11
-from sch.implnet_sch_wade11 import implnet_sch_wade11
-
-@repository
-def gleaner():
- jobs = [implnet_job_nwisgw20, implnet_job_nwisgw22, implnet_job_nwisgw16, implnet_job_nwisgw12, implnet_job_nwisgw25, implnet_job_nwisgw14, implnet_job_nwisgw23, implnet_job_nwisgw10, implnet_job_nwisgw15, implnet_job_nwisgw2, implnet_job_nwisgw24, implnet_job_nwisgw9, implnet_job_nwisgw19, implnet_job_nwisgw28, implnet_job_nwisgw26, implnet_job_nwisgw5, implnet_job_nwisgw13, implnet_job_nwisgw6, implnet_job_nwisgw3, implnet_job_nwisgw4, implnet_job_nwisgw1, implnet_job_nwisgw21, implnet_job_nwisgw27, implnet_job_nwisgw8, implnet_job_nwisgw17, implnet_job_nwisgw18, implnet_job_nwisgw7, implnet_job_nwisgw11, implnet_job_nwisgw0, implnet_job_nwissite1, implnet_job_nwissite3, implnet_job_nwissite0, implnet_job_nwissite2, implnet_job_gfv11pois1, implnet_job_gfv11pois0, implnet_job_hydrologicunit0, implnet_job_damspids0, implnet_job_cuahsihishydrodataczhrids0, implnet_job_cuahsihisnooksackmicroclimatenetworkids0, implnet_job_cuahsihisneonids0, implnet_job_cuahsihisglobalriversobservatoryids0, implnet_job_cuahsihistncwaterdataids0, implnet_job_cuahsihisscotlandnwisids0, implnet_job_cuahsihisczoboulderids0, implnet_job_cuahsihisyosemitehydroclimatenetworkids0, implnet_job_cuahsihismuddyriverids0, implnet_job_cuahsihisczomercedids0, implnet_job_cuahsihisghcnids0, implnet_job_cuahsihismmaatacamaids0, implnet_job_cuahsihisumbcwqids0, implnet_job_cuahsihisgleonlakeannieids0, implnet_job_cuahsihisluwlids0, implnet_job_cuahsihiscedarriverids0, implnet_job_cuahsihisccbepdapids0, implnet_job_cuahsihiskansasweatherdataids0, implnet_job_cuahsihisodmkentstateids0, implnet_job_cuahsihisgleondorsetids0, implnet_job_cuahsihisclarksburgspids0, implnet_job_cuahsihiscrwaids0, implnet_job_cuahsihiscuisoids0, implnet_job_cuahsihisprovorivergamutids0, implnet_job_cuahsihisirwaids0, implnet_job_cuahsihisczoluquilloids0, implnet_job_cuahsihistuolumnemdwids0, implnet_job_cuahsihisrmblids0, implnet_job_cuahsihispanolaodmids0, implnet_job_cuahsihisnewnids0, implnet_job_cuahsihisczoudelids0, implnet_job_cuahsihisfarmrwaids0, implnet_job_cuahsihisskcmilltownids0, implnet_job_cuahsihisumbcgwids0, implnet_job_cuahsihisshalenetworkodmids0, implnet_job_cuahsihisnevadosids0, implnet_job_cuahsihisweiherbachids0, implnet_job_cuahsihismazarriverprojectids0, implnet_job_cuahsihisgleonsunapeeids0, implnet_job_cuahsihisorsancohabids0, implnet_job_cuahsihismwraids0, implnet_job_cuahsihismaaeriids0, implnet_job_cuahsihisnceiww2ids0, implnet_job_cuahsihistarlandwaterqualityids0, implnet_job_cuahsihislczoodm2ids0, implnet_job_cuahsihiscocorahsids0, implnet_job_cuahsihisparalanaturalezaids0, implnet_job_cuahsihisczocatalinaids0, implnet_job_cuahsihisieeratwilkesuniversityids0, implnet_job_cuahsihismudlakeids0, implnet_job_cuahsihismwdisids0, implnet_job_cuahsihisloganriverids0, implnet_job_cuahsihisscanids0, implnet_job_cuahsihisnashrwaids0, implnet_job_cuahsihismobilecrowdhydrologyids0, implnet_job_cuahsihisandrewsforestlterids0, implnet_job_cuahsihisloganrivergamutids0, implnet_job_cuahsihislittlebearriverids0, implnet_job_cuahsihislterntlwoodruffids0, implnet_job_cuahsihissagehencreekids0, implnet_job_cuahsihisshalenetworkodmids1, implnet_job_cuahsihisfrcwqmids0, implnet_job_cuahsihishydrodataczdids0, implnet_job_cuahsihisdrwiids0, implnet_job_cuahsihisubwpadids0, implnet_job_cuahsihistrwaids0, implnet_job_cuahsihisredbuttecreekgamutids0, implnet_job_cuahsihisglacialridgeids0, implnet_job_cuahsihisfcelterids0, implnet_job_cuahsihisczoarizids0, implnet_job_cuahsihiscalvinhhsids0, implnet_job_cuahsihissnotelids0, 
implnet_job_cuahsihisnevcanids0, implnet_job_cuahsihisczopsuids0, implnet_job_cuahsihisbrazilucbids0, implnet_job_cuahsihisgleonauburnids0, implnet_job_cuahsihislaselvastreamdischargeids0, implnet_job_cuahsihisisbenaids0, implnet_job_cuahsihisswedishmonitoringdataids0, implnet_job_cuahsihisunhsnowids0, implnet_job_cuahsihishassbergeids0, implnet_job_cuahsihisnhgswofids0, implnet_job_cuahsihisgonggaids0, implnet_job_cuahsihismopexids0, implnet_job_cagagespids0, implnet_job_sechydrgreg0, implnet_job_counties0, implnet_job_pws0, implnet_job_hu060, implnet_job_nataq0, implnet_job_cbsa0, implnet_job_hu080, implnet_job_hu040, implnet_job_princiaq0, implnet_job_refgage0, implnet_job_refgage3, implnet_job_refgage2, implnet_job_refgage1, implnet_job_dams0, implnet_job_dams1, implnet_job_ua100, implnet_job_states0, implnet_job_hu100, implnet_job_aiannh0, implnet_job_hu020, implnet_job_mainstems0, implnet_job_places0, implnet_job_hmw0, implnet_job_hmw1, implnet_job_huc12pp0, implnet_job_huc12pp1, implnet_job_nmwdiose3, implnet_job_nmwdiose2, implnet_job_nmwdiose0, implnet_job_nmwdiose4, implnet_job_nmwdiose1, implnet_job_nmwdist0, implnet_job_selfieids0, implnet_job_chyldpilotids0, implnet_job_rise0, implnet_job_autotest10, implnet_job_links0, implnet_job_demo0, implnet_job_autotest20, implnet_job_wade2, implnet_job_wade0, implnet_job_wade17, implnet_job_wade9, implnet_job_wade7, implnet_job_wade3, implnet_job_wade15, implnet_job_wade5, implnet_job_wade10, implnet_job_wade14, implnet_job_wade18, implnet_job_wade13, implnet_job_wade8, implnet_job_wade19, implnet_job_wade12, implnet_job_wade4, implnet_job_wade16, implnet_job_wade1, implnet_job_wade6, implnet_job_wade11]
- schedules = [implnet_sch_nwisgw20, implnet_sch_nwisgw22, implnet_sch_nwisgw16, implnet_sch_nwisgw12, implnet_sch_nwisgw25, implnet_sch_nwisgw14, implnet_sch_nwisgw23, implnet_sch_nwisgw10, implnet_sch_nwisgw15, implnet_sch_nwisgw2, implnet_sch_nwisgw24, implnet_sch_nwisgw9, implnet_sch_nwisgw19, implnet_sch_nwisgw28, implnet_sch_nwisgw26, implnet_sch_nwisgw5, implnet_sch_nwisgw13, implnet_sch_nwisgw6, implnet_sch_nwisgw3, implnet_sch_nwisgw4, implnet_sch_nwisgw1, implnet_sch_nwisgw21, implnet_sch_nwisgw27, implnet_sch_nwisgw8, implnet_sch_nwisgw17, implnet_sch_nwisgw18, implnet_sch_nwisgw7, implnet_sch_nwisgw11, implnet_sch_nwisgw0, implnet_sch_nwissite1, implnet_sch_nwissite3, implnet_sch_nwissite0, implnet_sch_nwissite2, implnet_sch_gfv11pois1, implnet_sch_gfv11pois0, implnet_sch_hydrologicunit0, implnet_sch_damspids0, implnet_sch_cuahsihishydrodataczhrids0, implnet_sch_cuahsihisnooksackmicroclimatenetworkids0, implnet_sch_cuahsihisneonids0, implnet_sch_cuahsihisglobalriversobservatoryids0, implnet_sch_cuahsihistncwaterdataids0, implnet_sch_cuahsihisscotlandnwisids0, implnet_sch_cuahsihisczoboulderids0, implnet_sch_cuahsihisyosemitehydroclimatenetworkids0, implnet_sch_cuahsihismuddyriverids0, implnet_sch_cuahsihisczomercedids0, implnet_sch_cuahsihisghcnids0, implnet_sch_cuahsihismmaatacamaids0, implnet_sch_cuahsihisumbcwqids0, implnet_sch_cuahsihisgleonlakeannieids0, implnet_sch_cuahsihisluwlids0, implnet_sch_cuahsihiscedarriverids0, implnet_sch_cuahsihisccbepdapids0, implnet_sch_cuahsihiskansasweatherdataids0, implnet_sch_cuahsihisodmkentstateids0, implnet_sch_cuahsihisgleondorsetids0, implnet_sch_cuahsihisclarksburgspids0, implnet_sch_cuahsihiscrwaids0, implnet_sch_cuahsihiscuisoids0, implnet_sch_cuahsihisprovorivergamutids0, implnet_sch_cuahsihisirwaids0, implnet_sch_cuahsihisczoluquilloids0, implnet_sch_cuahsihistuolumnemdwids0, implnet_sch_cuahsihisrmblids0, implnet_sch_cuahsihispanolaodmids0, implnet_sch_cuahsihisnewnids0, implnet_sch_cuahsihisczoudelids0, implnet_sch_cuahsihisfarmrwaids0, implnet_sch_cuahsihisskcmilltownids0, implnet_sch_cuahsihisumbcgwids0, implnet_sch_cuahsihisshalenetworkodmids0, implnet_sch_cuahsihisnevadosids0, implnet_sch_cuahsihisweiherbachids0, implnet_sch_cuahsihismazarriverprojectids0, implnet_sch_cuahsihisgleonsunapeeids0, implnet_sch_cuahsihisorsancohabids0, implnet_sch_cuahsihismwraids0, implnet_sch_cuahsihismaaeriids0, implnet_sch_cuahsihisnceiww2ids0, implnet_sch_cuahsihistarlandwaterqualityids0, implnet_sch_cuahsihislczoodm2ids0, implnet_sch_cuahsihiscocorahsids0, implnet_sch_cuahsihisparalanaturalezaids0, implnet_sch_cuahsihisczocatalinaids0, implnet_sch_cuahsihisieeratwilkesuniversityids0, implnet_sch_cuahsihismudlakeids0, implnet_sch_cuahsihismwdisids0, implnet_sch_cuahsihisloganriverids0, implnet_sch_cuahsihisscanids0, implnet_sch_cuahsihisnashrwaids0, implnet_sch_cuahsihismobilecrowdhydrologyids0, implnet_sch_cuahsihisandrewsforestlterids0, implnet_sch_cuahsihisloganrivergamutids0, implnet_sch_cuahsihislittlebearriverids0, implnet_sch_cuahsihislterntlwoodruffids0, implnet_sch_cuahsihissagehencreekids0, implnet_sch_cuahsihisshalenetworkodmids1, implnet_sch_cuahsihisfrcwqmids0, implnet_sch_cuahsihishydrodataczdids0, implnet_sch_cuahsihisdrwiids0, implnet_sch_cuahsihisubwpadids0, implnet_sch_cuahsihistrwaids0, implnet_sch_cuahsihisredbuttecreekgamutids0, implnet_sch_cuahsihisglacialridgeids0, implnet_sch_cuahsihisfcelterids0, implnet_sch_cuahsihisczoarizids0, implnet_sch_cuahsihiscalvinhhsids0, implnet_sch_cuahsihissnotelids0, 
implnet_sch_cuahsihisnevcanids0, implnet_sch_cuahsihisczopsuids0, implnet_sch_cuahsihisbrazilucbids0, implnet_sch_cuahsihisgleonauburnids0, implnet_sch_cuahsihislaselvastreamdischargeids0, implnet_sch_cuahsihisisbenaids0, implnet_sch_cuahsihisswedishmonitoringdataids0, implnet_sch_cuahsihisunhsnowids0, implnet_sch_cuahsihishassbergeids0, implnet_sch_cuahsihisnhgswofids0, implnet_sch_cuahsihisgonggaids0, implnet_sch_cuahsihismopexids0, implnet_sch_cagagespids0, implnet_sch_sechydrgreg0, implnet_sch_counties0, implnet_sch_pws0, implnet_sch_hu060, implnet_sch_nataq0, implnet_sch_cbsa0, implnet_sch_hu080, implnet_sch_hu040, implnet_sch_princiaq0, implnet_sch_refgage0, implnet_sch_refgage3, implnet_sch_refgage2, implnet_sch_refgage1, implnet_sch_dams0, implnet_sch_dams1, implnet_sch_ua100, implnet_sch_states0, implnet_sch_hu100, implnet_sch_aiannh0, implnet_sch_hu020, implnet_sch_mainstems0, implnet_sch_places0, implnet_sch_hmw0, implnet_sch_hmw1, implnet_sch_huc12pp0, implnet_sch_huc12pp1, implnet_sch_nmwdiose3, implnet_sch_nmwdiose2, implnet_sch_nmwdiose0, implnet_sch_nmwdiose4, implnet_sch_nmwdiose1, implnet_sch_nmwdist0, implnet_sch_selfieids0, implnet_sch_chyldpilotids0, implnet_sch_rise0, implnet_sch_autotest10, implnet_sch_links0, implnet_sch_demo0, implnet_sch_autotest20, implnet_sch_wade2, implnet_sch_wade0, implnet_sch_wade17, implnet_sch_wade9, implnet_sch_wade7, implnet_sch_wade3, implnet_sch_wade15, implnet_sch_wade5, implnet_sch_wade10, implnet_sch_wade14, implnet_sch_wade18, implnet_sch_wade13, implnet_sch_wade8, implnet_sch_wade19, implnet_sch_wade12, implnet_sch_wade4, implnet_sch_wade16, implnet_sch_wade1, implnet_sch_wade6, implnet_sch_wade11]
-
-
- return jobs + schedules
diff --git a/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_africaioc.py b/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_africaioc.py
deleted file mode 100644
index 833bcffe..00000000
--- a/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_africaioc.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_africaioc import implnet_job_africaioc
-
-@schedule(cron_schedule="0 0 * * 0", job=implnet_job_africaioc, execution_timezone="US/Central")
-def implnet_sch_africaioc(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_aquadocs.py b/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_aquadocs.py
deleted file mode 100644
index 7ffe1dc9..00000000
--- a/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_aquadocs.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_aquadocs import implnet_job_aquadocs
-
-@schedule(cron_schedule="0 6 * * 0", job=implnet_job_aquadocs, execution_timezone="US/Central")
-def implnet_sch_aquadocs(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_benguelacc.py b/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_benguelacc.py
deleted file mode 100644
index 951c20fc..00000000
--- a/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_benguelacc.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_benguelacc import implnet_job_benguelacc
-
-@schedule(cron_schedule="0 12 * * 0", job=implnet_job_benguelacc, execution_timezone="US/Central")
-def implnet_sch_benguelacc(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_caribbeanmarineatlas.py b/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_caribbeanmarineatlas.py
deleted file mode 100644
index 74a73d41..00000000
--- a/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_caribbeanmarineatlas.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_caribbeanmarineatlas import implnet_job_caribbeanmarineatlas
-
-@schedule(cron_schedule="0 18 * * 0", job=implnet_job_caribbeanmarineatlas, execution_timezone="US/Central")
-def implnet_sch_caribbeanmarineatlas(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_cioos.py b/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_cioos.py
deleted file mode 100644
index bd0c283e..00000000
--- a/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_cioos.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_cioos import implnet_job_cioos
-
-@schedule(cron_schedule="0 0 * * 1", job=implnet_job_cioos, execution_timezone="US/Central")
-def implnet_sch_cioos(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_edmerp.py b/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_edmerp.py
deleted file mode 100644
index c0d3ccc8..00000000
--- a/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_edmerp.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_edmerp import implnet_job_edmerp
-
-@schedule(cron_schedule="0 6 * * 1", job=implnet_job_edmerp, execution_timezone="US/Central")
-def implnet_sch_edmerp(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_edmo.py b/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_edmo.py
deleted file mode 100644
index 9082898e..00000000
--- a/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_edmo.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_edmo import implnet_job_edmo
-
-@schedule(cron_schedule="0 12 * * 1", job=implnet_job_edmo, execution_timezone="US/Central")
-def implnet_sch_edmo(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_emodnet.py b/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_emodnet.py
deleted file mode 100644
index 3f4352d8..00000000
--- a/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_emodnet.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_emodnet import implnet_job_emodnet
-
-@schedule(cron_schedule="0 12 * * 3", job=implnet_job_emodnet, execution_timezone="US/Central")
-def implnet_sch_emodnet(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_euroceanevents.py b/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_euroceanevents.py
deleted file mode 100644
index 45642ea0..00000000
--- a/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_euroceanevents.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_euroceanevents import implnet_job_euroceanevents
-
-@schedule(cron_schedule="0 18 * * 1", job=implnet_job_euroceanevents, execution_timezone="US/Central")
-def implnet_sch_euroceanevents(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_euroceanexperts.py b/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_euroceanexperts.py
deleted file mode 100644
index 61dfa79a..00000000
--- a/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_euroceanexperts.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_euroceanexperts import implnet_job_euroceanexperts
-
-@schedule(cron_schedule="0 0 * * 2", job=implnet_job_euroceanexperts, execution_timezone="US/Central")
-def implnet_sch_euroceanexperts(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_euroceaninstitutions.py b/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_euroceaninstitutions.py
deleted file mode 100644
index 79a2d707..00000000
--- a/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_euroceaninstitutions.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_euroceaninstitutions import implnet_job_euroceaninstitutions
-
-@schedule(cron_schedule="0 6 * * 2", job=implnet_job_euroceaninstitutions, execution_timezone="US/Central")
-def implnet_sch_euroceaninstitutions(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_euroceanorgs.py b/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_euroceanorgs.py
deleted file mode 100644
index 07a5fad9..00000000
--- a/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_euroceanorgs.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_euroceanorgs import implnet_job_euroceanorgs
-
-@schedule(cron_schedule="0 12 * * 2", job=implnet_job_euroceanorgs, execution_timezone="US/Central")
-def implnet_sch_euroceanorgs(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_euroceanprojects.py b/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_euroceanprojects.py
deleted file mode 100644
index fc55bf23..00000000
--- a/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_euroceanprojects.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_euroceanprojects import implnet_job_euroceanprojects
-
-@schedule(cron_schedule="0 18 * * 2", job=implnet_job_euroceanprojects, execution_timezone="US/Central")
-def implnet_sch_euroceanprojects(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_euroceantraining.py b/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_euroceantraining.py
deleted file mode 100644
index 739929ce..00000000
--- a/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_euroceantraining.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_euroceantraining import implnet_job_euroceantraining
-
-@schedule(cron_schedule="0 0 * * 3", job=implnet_job_euroceantraining, execution_timezone="US/Central")
-def implnet_sch_euroceantraining(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_euroceanvessels.py b/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_euroceanvessels.py
deleted file mode 100644
index a72e6218..00000000
--- a/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_euroceanvessels.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_euroceanvessels import implnet_job_euroceanvessels
-
-@schedule(cron_schedule="0 6 * * 3", job=implnet_job_euroceanvessels, execution_timezone="US/Central")
-def implnet_sch_euroceanvessels(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_inanodc.py b/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_inanodc.py
deleted file mode 100644
index cf85b20f..00000000
--- a/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_inanodc.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_inanodc import implnet_job_inanodc
-
-@schedule(cron_schedule="0 18 * * 3", job=implnet_job_inanodc, execution_timezone="US/Central")
-def implnet_sch_inanodc(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_invemardocuments.py b/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_invemardocuments.py
deleted file mode 100644
index 03fc837a..00000000
--- a/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_invemardocuments.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_invemardocuments import implnet_job_invemardocuments
-
-@schedule(cron_schedule="0 0 * * 4", job=implnet_job_invemardocuments, execution_timezone="US/Central")
-def implnet_sch_invemardocuments(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_invemarexperts.py b/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_invemarexperts.py
deleted file mode 100644
index 08f65c09..00000000
--- a/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_invemarexperts.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_invemarexperts import implnet_job_invemarexperts
-
-@schedule(cron_schedule="0 6 * * 4", job=implnet_job_invemarexperts, execution_timezone="US/Central")
-def implnet_sch_invemarexperts(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_invemarinstitutions.py b/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_invemarinstitutions.py
deleted file mode 100644
index 43070d73..00000000
--- a/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_invemarinstitutions.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_invemarinstitutions import implnet_job_invemarinstitutions
-
-@schedule(cron_schedule="0 12 * * 4", job=implnet_job_invemarinstitutions, execution_timezone="US/Central")
-def implnet_sch_invemarinstitutions(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_invemartraining.py b/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_invemartraining.py
deleted file mode 100644
index 24ef2740..00000000
--- a/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_invemartraining.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_invemartraining import implnet_job_invemartraining
-
-@schedule(cron_schedule="0 18 * * 4", job=implnet_job_invemartraining, execution_timezone="US/Central")
-def implnet_sch_invemartraining(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_invemarvessels.py b/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_invemarvessels.py
deleted file mode 100644
index 6baaab09..00000000
--- a/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_invemarvessels.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_invemarvessels import implnet_job_invemarvessels
-
-@schedule(cron_schedule="0 0 * * 5", job=implnet_job_invemarvessels, execution_timezone="US/Central")
-def implnet_sch_invemarvessels(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_marinetraining.py b/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_marinetraining.py
deleted file mode 100644
index ae234b6f..00000000
--- a/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_marinetraining.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_marinetraining import implnet_job_marinetraining
-
-@schedule(cron_schedule="0 6 * * 5", job=implnet_job_marinetraining, execution_timezone="US/Central")
-def implnet_sch_marinetraining(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_maspawio.py b/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_maspawio.py
deleted file mode 100644
index b6f09ed9..00000000
--- a/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_maspawio.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_maspawio import implnet_job_maspawio
-
-@schedule(cron_schedule="0 12 * * 5", job=implnet_job_maspawio, execution_timezone="US/Central")
-def implnet_sch_maspawio(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_obis.py b/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_obis.py
deleted file mode 100644
index b8116bee..00000000
--- a/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_obis.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_obis import implnet_job_obis
-
-@schedule(cron_schedule="0 18 * * 5", job=implnet_job_obis, execution_timezone="US/Central")
-def implnet_sch_obis(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_obps.py b/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_obps.py
deleted file mode 100644
index 8b3ff4f5..00000000
--- a/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_obps.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_obps import implnet_job_obps
-
-@schedule(cron_schedule="0 0 * * 6", job=implnet_job_obps, execution_timezone="US/Central")
-def implnet_sch_obps(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_oceanexperts.py b/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_oceanexperts.py
deleted file mode 100644
index db84bdfb..00000000
--- a/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_oceanexperts.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_oceanexperts import implnet_job_oceanexperts
-
-@schedule(cron_schedule="0 6 * * 6", job=implnet_job_oceanexperts, execution_timezone="US/Central")
-def implnet_sch_oceanexperts(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_oceanscape.py b/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_oceanscape.py
deleted file mode 100644
index 4d2438bf..00000000
--- a/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_oceanscape.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_oceanscape import implnet_job_oceanscape
-
-@schedule(cron_schedule="0 12 * * 6", job=implnet_job_oceanscape, execution_timezone="US/Central")
-def implnet_sch_oceanscape(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_pdh.py b/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_pdh.py
deleted file mode 100644
index 9920619d..00000000
--- a/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_pdh.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_pdh import implnet_job_pdh
-
-@schedule(cron_schedule="0 18 * * 6", job=implnet_job_pdh, execution_timezone="US/Central")
-def implnet_sch_pdh(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_pogo.py b/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_pogo.py
deleted file mode 100644
index d499c042..00000000
--- a/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_pogo.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_pogo import implnet_job_pogo
-
-@schedule(cron_schedule="0 0 * * 0", job=implnet_job_pogo, execution_timezone="US/Central")
-def implnet_sch_pogo(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/generatedCode/implnet-oih/output/workspace.yaml b/dagster/implnets/generatedCode/implnet-oih/output/workspace.yaml
deleted file mode 100644
index 54490e1d..00000000
--- a/dagster/implnets/generatedCode/implnet-oih/output/workspace.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-load_from:
- - python_file:
- relative_path: "repositories/repository.py"
- working_directory: .
\ No newline at end of file
diff --git a/dagster/implnets/pygen.py b/dagster/implnets/pygen.py
index ae8bc80a..fa61442c 100644
--- a/dagster/implnets/pygen.py
+++ b/dagster/implnets/pygen.py
@@ -6,7 +6,7 @@
import fileinput
import re
import shutil
-
+import pydash
# python pygen.py -cf ../../configs/oih/gleanerconfig.yaml -od ./output -td ./templates
@@ -24,8 +24,9 @@ def gencode(cf, od, td, days) -> str:
inc = round(hours / len(c["sources"])) # divide the hours we want to run over by the number of sources to get the increment
print("index event every {} hours over {} day(s) period for {} items".format(inc, days, len(c["sources"])))
-
- for i, s in enumerate(c["sources"]):
+ sources = pydash.union_by(c["sources"], lambda source: source["name"])
+ #for i, s in enumerate(c["sources"]):
+ for i, s in enumerate(sources):
# could put an if statement here for those that are active
# print(s["name"])
diff --git a/dagster/implnets/requirements.txt b/dagster/implnets/requirements.txt
index e7a319c9..bf6e8ff1 100644
--- a/dagster/implnets/requirements.txt
+++ b/dagster/implnets/requirements.txt
@@ -1,17 +1,25 @@
-dagit>=1.4.2
-dagster-postgres==0.20.2
-dagster>=1.4.2
-dagster-webserver>=1.4.2
-dagster-docker
-dagster-aws
+dagit>=1.7.10
+dagster-postgres>=0.23.10
+dagster>=1.7.10
+dagster-graphql>=1.7.10
+dagster-webserver>=1.7.10
+dagster-docker>=0.23.10
+dagster-aws>=0.23.10
+dagster_slack>=0.23.10
ipython-genutils==0.2.0
advertools==0.13.2
minio==7.1.13
docker>=6.1.0
+dagstermill>=0.23.10
+notebook
+pydash
+pyyaml
+orjson
+
+#earthcube-utilities>=0.1.26
+earthcube-utilities @ git+https://github.com/earthcube/earthcube_utilities@dev#egg=earthcube_utilities&subdirectory=earthcube_utilities
-earthcube-utilities>=0.1.18
-#earthcube-utilities @ git+https://github.com/earthcube/earthcube_utilities@b671efb#subdirectory=earthcube_utilities
# if we want to use a non-released branch, 2c1dcab is the commit
# earthcube-utilities @ git+https://github.com/earthcube/earthcube_utilities@2c1dcab#subdirectory=earthcube_utilities
diff --git a/dagster/implnets/requirements_code.txt b/dagster/implnets/requirements_code.txt
deleted file mode 100644
index 445ea1e9..00000000
--- a/dagster/implnets/requirements_code.txt
+++ /dev/null
@@ -1,22 +0,0 @@
-
-# this includes files that are used in the code server,
-# eg requirements of a code run on dagster
-# needs the dagster code for the dagster api grpc call,
-# but do not think we need the webserver/dagit
-#dagit>=1.4.2
-dagster-postgres>=0.20.2
-dagster>=1.4.2
-#dagster-webserver>=1.4.2
-dagster-docker
-dagster-aws
-ipython-genutils==0.2.0
-advertools==0.13.2
-minio==7.1.13
-docker>=6.1.0
-
-earthcube-utilities>=0.1.18
-#earthcube-utilities @ git+https://github.com/earthcube/earthcube_utilities@b671efb#subdirectory=earthcube_utilities
-# if we want to use an non-released branch 2c1dcab is the commit
-# earthcube-utilities @ git+https://github.com/earthcube/earthcube_utilities@2c1dcab#subdirectory=earthcube_utilities
-
-
diff --git a/dagster/implnets/requirements_dagster.txt b/dagster/implnets/requirements_dagster.txt
index 1629762e..e21687a5 100644
--- a/dagster/implnets/requirements_dagster.txt
+++ b/dagster/implnets/requirements_dagster.txt
@@ -1,10 +1,11 @@
-
-dagit>=1.4.2
-dagster-postgres>=0.20.2
-dagster>=1.4.2
-dagster-webserver>=1.4.2
-dagster-docker
-dagster-aws
+dagit>=1.7.7
+dagster-postgres>=0.23.7
+dagster>=1.7.7
+dagster-webserver>=1.7.7
+dagster-docker>=0.23.7
+dagster-aws>=0.23.7
+dagstermill>=0.23.7
+dagster_slack>=0.23.7
###
# these are modules that are not part of the dagster base image
# ipython-genutils==0.2.0
@@ -12,7 +13,7 @@ dagster-aws
# minio==7.1.13
# docker>=6.1.0
#
-# earthcube-utilities>=0.1.17
+# earthcube-utilities>=0.1.25
diff --git a/dagster/implnets/templates/v1/implnet_ops_SOURCEVAL.py b/dagster/implnets/templates/v1/implnet_ops_SOURCEVAL.py
index 9618feef..d4a335e9 100644
--- a/dagster/implnets/templates/v1/implnet_ops_SOURCEVAL.py
+++ b/dagster/implnets/templates/v1/implnet_ops_SOURCEVAL.py
@@ -1,7 +1,9 @@
+import csv
import distutils
import logging
import time
+import pandas
from dagster import job, op, graph,In, Nothing, get_dagster_logger
import os, json, io
import urllib
@@ -12,10 +14,12 @@
from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner, endpointUpdateNamespace
import json
+from ec.graph.release_graph import ReleaseGraph
from minio import Minio
from minio.error import S3Error
from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
+from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo, \
+ generateGraphReportsRelease
from ec.datastore import s3
from ec.summarize import summaryDF2ttl, get_summary4graph, get_summary4repoSubset
from ec.graph.manageGraph import ManageBlazegraph as mg
@@ -36,19 +40,19 @@
from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
+DEBUG=(os.getenv('DEBUG_CONTAINER', 'False').lower() == 'true')
# #
# path to gleaner config in Dagster-daemon is "/scheduler/gleanerconfig.yaml" (config file mounted)
# WHEN RUNNING dagster-dev, this needs to be a path to a local file
##
-DAGSTER_GLEANER_CONFIG_PATH = os.environ.get('DAGSTER_GLEANER_CONFIG_PATH', "/scheduler/gleanerconfig.yaml")
+DAGSTER_GLEANER_CONFIG_PATH = os.environ.get('GLEANERIO_DAGSTER_CONFIG_PATH', "/scheduler/gleanerconfig.yaml")
# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
+GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_DOCKER_HEADLESS_NETWORK', "headless_gleanerio")
# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
+URL = os.environ.get('GLEANERIO_DOCKER_URL')
+APIKEY = os.environ.get('GLEANERIO_PORTAINER_APIKEY')
+CONTAINER_WAIT_TIMEOUT= int( os.environ.get('GLEANERIO_DOCKER_CONTAINER_WAIT_TIMEOUT',300))
GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
@@ -77,10 +81,11 @@
GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
+GLEANERIO_DOCKER_GLEANER_CONFIG=str(os.environ.get('GLEANERIO_DOCKER_GLEANER_CONFIG', 'gleaner'))
+GLEANERIO_DOCKER_NABU_CONFIG=str(os.environ.get('GLEANERIO_DOCKER_NABU_CONFIG', 'nabu'))
#GLEANERIO_SUMMARY_GRAPH_ENDPOINT = os.environ.get('GLEANERIO_SUMMARY_GRAPH_ENDPOINT')
-GLEANERIO_SUMMARY_GRAPH_NAMESPACE = os.environ.get('GLEANERIO_SUMMARY_GRAPH_NAMESPACE',f"{GLEANER_GRAPH_NAMESPACE}_summary" )
+GLEANERIO_SUMMARY_GRAPH_NAMESPACE = os.environ.get('GLEANERIO_GRAPH_SUMMARY_NAMESPACE',f"{GLEANER_GRAPH_NAMESPACE}_summary" )
+GLEANERIO_SUMMARIZE_GRAPH=(os.getenv('GLEANERIO_GRAPH_SUMMARIZE', 'False').lower() == 'true')
SUMMARY_PATH = 'graphs/summary'
RELEASE_PATH = 'graphs/latest'
@@ -138,7 +143,7 @@ def s3reader(object):
get_dagster_logger().info(f"S3 read error : {str(err)}")
-def s3loader(data, name):
+def s3_log_uploader(data, name, date_string=None):
secure= GLEANER_MINIO_USE_SSL
server = _pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT)
@@ -158,8 +163,8 @@ def s3loader(data, name):
# else:
# print("Bucket 'X' already exists")
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
+    if date_string is None:
+        date_string = datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
logname = name + '_{}.log'.format(date_string)
objPrefix = GLEANERIO_LOG_PREFIX + logname
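Editorial note on the date_string default above: a datetime.now() expression placed directly in the signature would be evaluated once at import time (Python's early-bound defaults), so every call omitting date_string would reuse the same stamp; hence the None default with an in-body check. A minimal illustration:

```python
from datetime import datetime

def stamped(name, date_string=datetime.now().strftime("%Y_%m_%d_%H_%M_%S")):
    # The default was computed when the module was imported, so repeated
    # calls that omit date_string all get the same stamp.
    return f"{name}_{date_string}.log"

def stamped_safe(name, date_string=None):
    # Computed per call instead.
    if date_string is None:
        date_string = datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
    return f"{name}_{date_string}.log"
```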
@@ -167,24 +172,38 @@ def s3loader(data, name):
#length = f.write(bytes(json_str, 'utf-8'))
length = f.write(data)
f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def post_to_graph(source, path=RELEASE_PATH, extension="nq", graphendpoint=_graphEndpoint()):
- # revision of EC utilities, will have a insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
+ try:
+ client.put_object(GLEANER_MINIO_BUCKET,
+ objPrefix,
+ f, #io.BytesIO(data),
+ length, #len(data),
+ content_type="text/plain"
+ )
+ get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
+ except Exception as ex:
+ get_dagster_logger().error(f"Log upload failed: {str(objPrefix)}")
+
+def _releaseUrl( source, path=RELEASE_PATH, extension="nq"):
proto = "http"
-
if GLEANER_MINIO_USE_SSL:
proto = "https"
- port = GLEANER_MINIO_PORT
address = _pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT)
bucket = GLEANER_MINIO_BUCKET
release_url = f"{proto}://{address}/{bucket}/{path}/{source}_release.{extension}"
+ return release_url
+
+def post_to_graph(source, path=RELEASE_PATH, extension="nq", graphendpoint=_graphEndpoint()):
+ # revision of EC utilities, will have a insertFromURL
+ #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
+ # proto = "http"
+ #
+ # if GLEANER_MINIO_USE_SSL:
+ # proto = "https"
+ # port = GLEANER_MINIO_PORT
+ # address = _pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT)
+ # bucket = GLEANER_MINIO_BUCKET
+ # release_url = f"{proto}://{address}/{bucket}/{path}/{source}_release.{extension}"
+
# BLAZEGRAPH SPECIFIC
# url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
# get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
@@ -201,6 +220,7 @@ def post_to_graph(source, path=RELEASE_PATH, extension="nq", graphendpoint=_grap
# get_dagster_logger().info(f'graph: error')
# raise Exception(f' graph: insert failed: status:{r.status_code}')
+ release_url = _releaseUrl(source, path, extension)
### GENERIC LOAD FROM
url = f"{graphendpoint}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
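A small worked example of what the refactored _releaseUrl helper assembles; the host, bucket, and source below are hypothetical:

```python
def release_url(source, address, bucket, path="graphs/latest",
                extension="nq", use_ssl=True):
    # Mirrors _releaseUrl above: scheme from the SSL flag, then
    # {address}/{bucket}/{path}/{source}_release.{extension}
    proto = "https" if use_ssl else "http"
    return f"{proto}://{address}/{bucket}/{path}/{source}_release.{extension}"

assert release_url("obis", "minio.example.org", "gleaner") == \
    "https://minio.example.org/gleaner/graphs/latest/obis_release.nq"
```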
@@ -263,13 +283,13 @@ def _create_service(
serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1)
get_dagster_logger().info(str(client.configs.list()))
# gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
- gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
+ gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_DOCKER_GLEANER_CONFIG]})
get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
- nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]})
+ nabuconfig = client.configs.list(filters={"name":[GLEANERIO_DOCKER_NABU_CONFIG]})
get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
get_dagster_logger().info(f"create docker service for {name}")
- gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
- nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
+ gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_DOCKER_GLEANER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH)
+ nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_DOCKER_NABU_CONFIG,GLEANERIO_NABU_CONFIG_PATH)
configs = [gleaner,nabu]
# name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
service = client.services.create(
@@ -307,7 +327,7 @@ def gleanerio(context, mode, source):
## ------------ Create
returnCode = 0
get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
+ date_string = datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
if str(mode) == "gleaner":
IMAGE =GLEANERIO_GLEANER_IMAGE
@@ -424,13 +444,8 @@ def gleanerio(context, mode, source):
data["Env"] = enva
data["HostConfig"] = {
"NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
}
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
+
# docker dagster
get_dagster_logger().info(f"start docker code region: ")
@@ -468,102 +483,74 @@ def gleanerio(context, mode, source):
cid = container.id # legacy til the start get's fixed
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-# this method of watching the logs,
- # do not let a possible issue with container logs stop log upload.
- ## I thinkthis happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
-
- get_dagster_logger().info(f"This is ok. watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"This is ok. watch container logs failed other issue:{repr(ex)} ")
-
-
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + 'containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
- # Future, need to extraxct files, and upload
+# Removed watching the logs, in favor of periodic upload
+ wait_count = 0
+ while True:
+ wait_count += 1
+ try:
+ exit_status = container.wait(timeout=CONTAINER_WAIT_TIMEOUT)["StatusCode"]
+ get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
+ # WE PULL THE LOGS, then will throw an error
+ returnCode = exit_status
+ c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
+
+ # write to s3
+
+ s3_log_uploader(str(c).encode(), NAME, date_string=date_string) # s3_log_uploader needs a bytes-like object
+ # s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
+ # write to minio (would need the minio info here)
+
+ get_dagster_logger().info(f"container Logs to s3: ")
+# this needs to be addressed at some point. https://www.appsloveworld.com/docker/100/85/docker-py-getarchive-destination-folder
+ path = f"{WorkingDir}/logs"
+ tar_archive_stream, tar_stat = container.get_archive(path)
+ archive = bytearray()
+ for chunk in tar_archive_stream:
+ archive.extend(chunk)
+ s3_log_uploader(archive, f"{source}_{mode}_runlogs", date_string=date_string)
+ get_dagster_logger().info(f"uploaded logs : {source}_{mode}_runlogs to {path}")
+ break
+ except requests.exceptions.ReadTimeout as ex:
+ path = f"{WorkingDir}/logs"
+ tar_archive_stream, tar_stat = container.get_archive(path)
+ archive = bytearray()
+ for chunk in tar_archive_stream:
+ archive.extend(chunk)
+ s3_log_uploader(archive, f"{source}_{mode}_runlogs", date_string=date_string)
+ get_dagster_logger().info(f"uploaded {wait_count}th log : {source}_{mode}_runlogs to {path}")
+ except docker.errors.APIError as ex:
+ get_dagster_logger().info(f"Container Wait docker API error : {str(ex)}")
+ returnCode = 1
+ break
+ if container.status == 'exited' or container.status == 'removed':
+ get_dagster_logger().info(f"Container exited or removed. status: {container.status}")
+ exit_status = container.wait()["StatusCode"]
+ returnCode = exit_status
+ c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
+ s3_log_uploader(str(c).encode(), NAME, date_string=date_string) # s3_log_uploader needs a bytes-like object
+ # s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
+ # write to minio (would need the minio info here)
+
+ get_dagster_logger().info(f"container Logs to s3: ")
+ # this needs to be addressed at some point. https://www.appsloveworld.com/docker/100/85/docker-py-getarchive-destination-folder
+ path = f"{WorkingDir}/logs"
+ tar_archive_stream, tar_stat = container.get_archive(path)
+ archive = bytearray()
+ for chunk in tar_archive_stream:
+ archive.extend(chunk)
+ s3_log_uploader(archive, f"{source}_{mode}_runlogs", date_string=date_string)
+ get_dagster_logger().info(f"uploaded logs : {source}_{mode}_runlogs to {path}")
+ break
+
+ # ABOVE: in the future, need to extract files and upload
# pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
# pw_tar.extractall("extract_to/")
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
if exit_status != 0:
raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
finally:
if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
if (service):
service.remove()
get_dagster_logger().info(f"Service Remove: {service.name}")
@@ -572,14 +559,7 @@ def gleanerio(context, mode, source):
else:
get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
+
if (returnCode != 0):
get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
@@ -685,14 +665,15 @@ def SOURCEVAL_graph_reports(context) :
graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
+
+ #returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
+ s3FileUrl = _releaseUrl(source_name )
+ returned_value = generateGraphReportsRelease(source_name,s3FileUrl)
r = str('returned value:{}'.format(returned_value))
#report = json.dumps(returned_value, indent=2) # value already json.dumps
report = returned_value
s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
- get_dagster_logger().info(f"graph report returned {r} ")
+ get_dagster_logger().info(f"graph stats returned {r} ")
return
@op(ins={"start": In(Nothing)})
@@ -718,8 +699,8 @@ def SOURCEVAL_bucket_urls(context):
res = s3Minio.listSummonedUrls(bucket, source_name)
r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
+ bucketurls = pandas.DataFrame(res).to_csv(index=False, quoting=csv.QUOTE_NONNUMERIC)
+ s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.csv", bucketurls)
get_dagster_logger().info(f"bucker urls report returned {r} ")
return
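The JSON-to-CSV switch above, in isolation (row content is illustrative): QUOTE_NONNUMERIC quotes every non-numeric field, which keeps URLs containing commas intact in the report file.

```python
import csv
import pandas

res = [{"url": "https://example.org/sitemap.xml", "status": "summoned"}]
print(pandas.DataFrame(res).to_csv(index=False, quoting=csv.QUOTE_NONNUMERIC))
# "url","status"
# "https://example.org/sitemap.xml","summoned"
```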
@@ -738,7 +719,13 @@ def SOURCEVAL_summarize(context) :
try:
- summarydf = get_summary4repoSubset(endpoint, source_name)
+ # summarydf = get_summary4repoSubset(endpoint, source_name)
+ rg = ReleaseGraph()
+ rg.read_release(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT),
+ bucket,
+ source_name,
+ options=MINIO_OPTIONS)
+ summarydf = rg.summarize()
nt, g = summaryDF2ttl(summarydf, source_name) # let's try the new generator
summaryttl = g.serialize(format='longturtle')
# Lets always write out file to s3, and insert as a separate process
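The new summarize path above, end to end, as a hedged sketch: ReleaseGraph and summaryDF2ttl come from earthcube-utilities, the argument order follows the call in this diff, and the host, bucket, and source values are placeholders.

```python
from ec.graph.release_graph import ReleaseGraph
from ec.summarize import summaryDF2ttl

rg = ReleaseGraph()
# Read the latest release file for a source straight from the object store.
rg.read_release("minio.example.org", "gleaner", "obis",
                options={"secure": True})
summarydf = rg.summarize()

# Turn the summary frame into triples, same as the op above.
nt, g = summaryDF2ttl(summarydf, "obis")
summaryttl = g.serialize(format="longturtle")
```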
@@ -791,10 +778,11 @@ def harvest_SOURCEVAL():
# defingin nothing dependencies
# https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
- report_ms3 = SOURCEVAL_missingreport_s3(start=harvest)
+ report_bucketurl = SOURCEVAL_bucket_urls(start=harvest)
+ report_ms3 = SOURCEVAL_missingreport_s3(start=report_bucketurl)
report_idstat = SOURCEVAL_identifier_stats(start=report_ms3)
# for some reason, this causes a msg parameter missing
- report_bucketurl = SOURCEVAL_bucket_urls(start=report_idstat)
+
#report1 = missingreport_s3(harvest, source="SOURCEVAL")
load_release = SOURCEVAL_naburelease(start=harvest)
@@ -804,11 +792,14 @@ def harvest_SOURCEVAL():
load_prov = SOURCEVAL_nabuprov(start=load_prune)
load_org = SOURCEVAL_nabuorg(start=load_prov)
- summarize = SOURCEVAL_summarize(start=load_uploadrelease)
- upload_summarize = SOURCEVAL_upload_summarize(start=summarize)
+ if(GLEANERIO_SUMMARIZE_GRAPH):
+ summarize = SOURCEVAL_summarize(start=load_uploadrelease)
+ upload_summarize = SOURCEVAL_upload_summarize(start=summarize)
+
+
# run after load
- report_msgraph = SOURCEVAL_missingreport_graph(start=summarize)
+ report_msgraph = SOURCEVAL_missingreport_graph(start=load_prov)
report_graph = SOURCEVAL_graph_reports(start=report_msgraph)
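The reordering above relies on Dagster's Nothing dependencies, which sequence ops without passing data between them; a minimal sketch of the pattern, with op names shortened:

```python
from dagster import In, Nothing, job, op

@op
def harvest():
    pass

@op(ins={"start": In(Nothing)})
def bucket_urls():
    pass

@op(ins={"start": In(Nothing)})
def missingreport_s3():
    pass

@job
def harvest_chain():
    # Pure ordering: bucket_urls waits on harvest, the report waits on it.
    missingreport_s3(start=bucket_urls(start=harvest()))
```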
diff --git a/dagster/implnets/workflows/__init__.py b/dagster/implnets/workflows/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/dagster/implnets/workflows/ecrr/NOTE_ECRR.md b/dagster/implnets/workflows/ecrr/NOTE_ECRR.md
index 4676f75b..6fafd1ae 100644
--- a/dagster/implnets/workflows/ecrr/NOTE_ECRR.md
+++ b/dagster/implnets/workflows/ecrr/NOTE_ECRR.md
@@ -1,12 +1,24 @@
ECRR from google drive will require a manual harvest, and manual configuration.
+ECRR_SUBMITTED IS THE REPO TO LOAD.
+
+ECRR_EXAMPLES is a sitemap from the GecodesMetadata repository
+
+It will not need to be summoned. It will be rsync'd from the old Google Drive for now, and
+later it will need to read from the s3 bucket where the files are stored by the JSONFORMS app.
+
You need to generate the code, and modify the deployed config files in s3.
-pygen.py -cf ./configs/ecrr/gleanerconfig.yaml -od ./repositories/ecrr -td ./templates/v1 -d 7
+pygen.py -cf ./configs/ecrr/gleanerconfig.yaml -od ./workflows/generated/ecrr -td ./templates/v1 -d 7
+
+Then modify the output for the ops files and put it into the ecrr folder.
-Then modify the output for the ops files
GLEANER_MINIO_BUCKET = os.environ.get('ECRR_MINIO_BUCKET')
GLEANER_GRAPH_NAMESPACE = os.environ.get('ECRR_GRAPH_NAMESPACE')
@@ -17,4 +29,13 @@ Remove the gleaner, missing reporting, identifer, bucket url steps...
summarize steps.
pass some string to the first nabu step
+# RUNNING LOCALLY
+* You need to point at a Docker STACK or a Portainer endpoint. A local workstation Docker is usually not a STACK.
+* Set the ENV variables: ECRR_MINIO_BUCKET, ECRR_GRAPH_NAMESPACE
+
+`cd workflows/ecrr/ecrr`
+`python -m dagster dev`
+
+To run a job:
+
+`cd workflows/ecrr/ecrr`
+`python -m dagster job execute -f jobs/implnet_jobs_ecrr_examples.py -j implnet_job_ecrr_examples`
diff --git a/dagster/implnets/workflows/ecrr/ecrr/__init__.py b/dagster/implnets/workflows/ecrr/ecrr/__init__.py
new file mode 100644
index 00000000..1929c4ef
--- /dev/null
+++ b/dagster/implnets/workflows/ecrr/ecrr/__init__.py
@@ -0,0 +1,20 @@
+from dagster import repository, Definitions
+import os
+from .jobs.implnet_jobs_ecrr_submitted import job_ecrr_submitted
+from .sch.implnet_sch_ecrr_submitted import implnet_sch_ecrr_submitted
+from .jobs.implnet_jobs_ecrr_examples import job_ecrr_examples
+from .sch.implnet_sch_ecrr_examples import implnet_sch_ecrr_examples
+
+from dagster_slack import SlackResource, make_slack_on_run_failure_sensor
+slack_on_run_failure = make_slack_on_run_failure_sensor(
+ os.getenv("SLACK_CHANNEL"),
+ os.getenv("SLACK_TOKEN")
+)
+jobs = [ job_ecrr_submitted, job_ecrr_examples]
+schedules = [ implnet_sch_ecrr_submitted, implnet_sch_ecrr_examples]
+
+defs = Definitions(
+ jobs=jobs,
+ schedules=schedules,
+ sensors=[slack_on_run_failure]
+)
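One caveat in the Definitions above: `os.getenv` returns None when SLACK_CHANNEL or SLACK_TOKEN is unset, so the failure sensor will error at fire time. A hedged sketch of one way to register the sensor only when both variables are present (this guard is an editorial suggestion, not code from this PR):

```python
import os

from dagster import Definitions
from dagster_slack import make_slack_on_run_failure_sensor

# Only register the Slack failure sensor when credentials are configured.
sensors = []
if os.getenv("SLACK_CHANNEL") and os.getenv("SLACK_TOKEN"):
    sensors.append(
        make_slack_on_run_failure_sensor(
            os.getenv("SLACK_CHANNEL"),
            os.getenv("SLACK_TOKEN"),
        )
    )

# jobs and schedules elided; they are unchanged from the module above
defs = Definitions(jobs=[], schedules=[], sensors=sensors)
```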
diff --git a/dagster/implnets/workflows/ecrr/ecrr/jobs/implnet_jobs_ecrr_examples.py b/dagster/implnets/workflows/ecrr/ecrr/jobs/implnet_jobs_ecrr_examples.py
new file mode 100644
index 00000000..6370385a
--- /dev/null
+++ b/dagster/implnets/workflows/ecrr/ecrr/jobs/implnet_jobs_ecrr_examples.py
@@ -0,0 +1,7 @@
+from dagster import job
+
+from ..ops.implnet_ops_ecrr_examples import reload_ecrr_examples
+
+@job
+def job_ecrr_examples():
+ reload_ecrr_examples()
diff --git a/dagster/implnets/workflows/ecrr/ecrr/jobs/implnet_jobs_ecrr_submitted.py b/dagster/implnets/workflows/ecrr/ecrr/jobs/implnet_jobs_ecrr_submitted.py
new file mode 100644
index 00000000..8a099aee
--- /dev/null
+++ b/dagster/implnets/workflows/ecrr/ecrr/jobs/implnet_jobs_ecrr_submitted.py
@@ -0,0 +1,7 @@
+from dagster import job
+
+from ..ops.implnet_ops_ecrr_submitted import reload_ecrr_submitted
+
+@job
+def job_ecrr_submitted():
+ reload_ecrr_submitted()
diff --git a/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_ecrr_examples.py b/dagster/implnets/workflows/ecrr/ecrr/ops/implnet_ops_ecrr_examples.py
similarity index 95%
rename from dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_ecrr_examples.py
rename to dagster/implnets/workflows/ecrr/ecrr/ops/implnet_ops_ecrr_examples.py
index c79c52c5..7ca4f370 100644
--- a/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_ecrr_examples.py
+++ b/dagster/implnets/workflows/ecrr/ecrr/ops/implnet_ops_ecrr_examples.py
@@ -1,4 +1,4 @@
-import distutils
+from distutils import util
import logging
import time
@@ -46,16 +46,16 @@
# Vars and Envs
GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
+URL = os.environ.get('GLEANERIO_DOCKER_URL')
+APIKEY = os.environ.get('GLEANERIO_PORTAINER_APIKEY')
GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
+GLEANER_MINIO_USE_SSL = bool(util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
+GLEANER_MINIO_BUCKET =str( os.environ.get('ECRR_MINIO_BUCKET'))
# set for the earhtcube utiltiies
MINIO_OPTIONS={"secure":GLEANER_MINIO_USE_SSL
@@ -67,7 +67,7 @@
GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
# using GLEANER, even though this is a nabu property... same prefix seems easier
GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
+GLEANER_GRAPH_NAMESPACE = str(os.environ.get('ECRR_GRAPH_NAMESPACE'))
GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
@@ -77,8 +77,8 @@
GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
+GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_DOCKER_GLEANER_CONFIG', 'gleaner'))
+GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_DOCKER_NABU_CONFIG', 'nabu'))
#GLEANERIO_SUMMARY_GRAPH_ENDPOINT = os.environ.get('GLEANERIO_SUMMARY_GRAPH_ENDPOINT')
GLEANERIO_SUMMARY_GRAPH_NAMESPACE = os.environ.get('GLEANERIO_SUMMARY_GRAPH_NAMESPACE',f"{GLEANER_GRAPH_NAMESPACE}_summary" )
@@ -784,32 +784,13 @@ def ecrr_examples_upload_summarize(context):
# r = str('returned value:{}'.format(returned_value))
# return msg + r
@graph
-def harvest_ecrr_examples():
+def reload_ecrr_examples():
containers = ecrr_examples_getImage()
harvest = ecrr_examples_gleaner(start=containers)
-
-# defingin nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = ecrr_examples_missingreport_s3(start=harvest)
- report_idstat = ecrr_examples_identifier_stats(start=report_ms3)
- # for some reason, this causes a msg parameter missing
- report_bucketurl = ecrr_examples_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="ecrr_examples")
load_release = ecrr_examples_naburelease(start=harvest)
load_uploadrelease = ecrr_examples_uploadrelease(start=load_release)
+ # report_graph = ecrr_examples_graph_reports(start=load_uploadrelease)
- load_prune = ecrr_examples_nabu_prune(start=load_uploadrelease)
- load_prov = ecrr_examples_nabuprov(start=load_prune)
- load_org = ecrr_examples_nabuorg(start=load_prov)
-
- summarize = ecrr_examples_summarize(start=load_uploadrelease)
- upload_summarize = ecrr_examples_upload_summarize(start=summarize)
-
-# run after load
- report_msgraph = ecrr_examples_missingreport_graph(start=summarize)
- report_graph = ecrr_examples_graph_reports(start=report_msgraph)
diff --git a/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_edi.py b/dagster/implnets/workflows/ecrr/ecrr/ops/implnet_ops_ecrr_submitted.py
similarity index 90%
rename from dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_edi.py
rename to dagster/implnets/workflows/ecrr/ecrr/ops/implnet_ops_ecrr_submitted.py
index 4f25ac57..a36c6ee9 100644
--- a/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_edi.py
+++ b/dagster/implnets/workflows/ecrr/ecrr/ops/implnet_ops_ecrr_submitted.py
@@ -1,4 +1,4 @@
-import distutils
+from distutils import util
import logging
import time
@@ -36,26 +36,26 @@
from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
+DEBUG=(os.getenv('DEBUG_CONTAINER', 'False').lower() == 'true')
# #
# path to gleaner config in Dagster-daemon is "/scheduler/gleanerconfig.yaml" (config file mounted)
# WHEN RUNNING dagster-dev, this needs to be a path to a local file
##
-DAGSTER_GLEANER_CONFIG_PATH = os.environ.get('DAGSTER_GLEANER_CONFIG_PATH', "/scheduler/gleanerconfig.yaml")
+DAGSTER_GLEANER_CONFIG_PATH = os.environ.get('GLEANERIO_DAGSTER_CONFIG_PATH', "/scheduler/gleanerconfig.yaml")
# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
+GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_DOCKER_HEADLESS_NETWORK', "headless_gleanerio")
# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
+URL = os.environ.get('GLEANERIO_DOCKER_URL')
+APIKEY = os.environ.get('GLEANERIO_PORTAINER_APIKEY')
GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
-GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
+GLEANER_MINIO_USE_SSL = bool(util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
+GLEANER_MINIO_BUCKET =str( os.environ.get('ECRR_MINIO_BUCKET'))
# set for the earhtcube utiltiies
MINIO_OPTIONS={"secure":GLEANER_MINIO_USE_SSL
@@ -67,7 +67,7 @@
GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
# using GLEANER, even though this is a nabu property... same prefix seems easier
GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
-GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
+GLEANER_GRAPH_NAMESPACE = str(os.environ.get('ECRR_GRAPH_NAMESPACE'))
GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
@@ -77,10 +77,10 @@
GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
+GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_DOCKER_GLEANER_CONFIG', 'gleaner'))
+GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_DOCKER_NABU_CONFIG', 'nabu'))
#GLEANERIO_SUMMARY_GRAPH_ENDPOINT = os.environ.get('GLEANERIO_SUMMARY_GRAPH_ENDPOINT')
-GLEANERIO_SUMMARY_GRAPH_NAMESPACE = os.environ.get('GLEANERIO_SUMMARY_GRAPH_NAMESPACE',f"{GLEANER_GRAPH_NAMESPACE}_summary" )
+GLEANERIO_SUMMARY_GRAPH_NAMESPACE = os.environ.get('GLEANERIO_GRAPH_SUMMARY_ENDPOINT',f"{GLEANER_GRAPH_NAMESPACE}_summary" )
SUMMARY_PATH = 'graphs/summary'
RELEASE_PATH = 'graphs/latest'
@@ -206,7 +206,7 @@ def post_to_graph(source, path=RELEASE_PATH, extension="nq", graphendpoint=_grap
get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
loadfrom = {'update': f'LOAD <{release_url}>'}
headers = {
- 'Content-Type': 'application/x-www-form-urlencoded'
+ 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8'
}
r = requests.post(url, headers=headers, data=loadfrom )
log.debug(f' status:{r.status_code}') # status:404
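The header change above adds an explicit charset so the endpoint decodes the urlencoded LOAD body as UTF-8. Reduced to its essentials, the request this function issues looks like the following sketch (the endpoint and release URL are placeholders, not values from this repo):

```python
import requests

endpoint = "http://localhost:9999/blazegraph/namespace/ecrr/sparql"  # placeholder
release_url = "http://minio.example.org/gleaner/graphs/latest/ecrr_submitted_release.nq"  # placeholder

# SPARQL 1.1 UPDATE over HTTP: the LOAD statement travels as an urlencoded form field.
r = requests.post(
    endpoint,
    headers={"Content-Type": "application/x-www-form-urlencoded; charset=UTF-8"},
    data={"update": f"LOAD <{release_url}>"},
)
r.raise_for_status()
```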
@@ -587,7 +587,7 @@ def gleanerio(context, mode, source):
return returnCode
@op
-def edi_getImage(context):
+def ecrr_submitted_getImage(context):
run_container_context = DockerContainerContext.create_for_run(
context.dagster_run,
context.instance.run_launcher
@@ -599,54 +599,54 @@ def edi_getImage(context):
client.images.pull(GLEANERIO_GLEANER_IMAGE)
client.images.pull(GLEANERIO_NABU_IMAGE)
@op(ins={"start": In(Nothing)})
-def edi_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "edi")
+def ecrr_submitted_gleaner(context):
+ returned_value = gleanerio(context, ("gleaner"), "ecrr_submitted")
r = str('returned value:{}'.format(returned_value))
get_dagster_logger().info(f"Gleaner returned {r} ")
return
@op(ins={"start": In(Nothing)})
-def edi_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "edi")
+def ecrr_submitted_nabu_prune(context):
+ returned_value = gleanerio(context,("prune"), "ecrr_submitted")
r = str('returned value:{}'.format(returned_value))
get_dagster_logger().info(f"nabu prune returned {r} ")
return
@op(ins={"start": In(Nothing)})
-def edi_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "edi")
+def ecrr_submitted_nabuprov(context):
+ returned_value = gleanerio(context,("prov"), "ecrr_submitted")
r = str('returned value:{}'.format(returned_value))
get_dagster_logger().info(f"nabu prov returned {r} ")
return
@op(ins={"start": In(Nothing)})
-def edi_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "edi")
+def ecrr_submitted_nabuorg(context):
+ returned_value = gleanerio(context,("orgs"), "ecrr_submitted")
r = str('returned value:{}'.format(returned_value))
get_dagster_logger().info(f"nabu org load returned {r} ")
return
@op(ins={"start": In(Nothing)})
-def edi_naburelease(context):
- returned_value = gleanerio(context,("release"), "edi")
+def ecrr_submitted_naburelease(context):
+ returned_value = gleanerio(context,("release"), "ecrr_submitted")
r = str('returned value:{}'.format(returned_value))
get_dagster_logger().info(f"nabu release returned {r} ")
return
@op(ins={"start": In(Nothing)})
-def edi_uploadrelease(context):
- returned_value = post_to_graph("edi", extension="nq")
+def ecrr_submitted_uploadrelease(context):
+ returned_value = post_to_graph("ecrr_submitted", extension="nq")
r = str('returned value:{}'.format(returned_value))
get_dagster_logger().info(f"upload release returned {r} ")
return
@op(ins={"start": In(Nothing)})
-def edi_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="edi")
+def ecrr_submitted_missingreport_s3(context):
+ source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="ecrr_submitted")
source_url = source.get('url')
s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
bucket = GLEANER_MINIO_BUCKET
- source_name = "edi"
+ source_name = "ecrr_submitted"
graphendpoint = None
milled = False
summon = True
@@ -657,12 +657,12 @@ def edi_missingreport_s3(context):
get_dagster_logger().info(f"missing s3 report returned {r} ")
return
@op(ins={"start": In(Nothing)})
-def edi_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="edi")
+def ecrr_submitted_missingreport_graph(context):
+ source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="ecrr_submitted")
source_url = source.get('url')
s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
bucket = GLEANER_MINIO_BUCKET
- source_name = "edi"
+ source_name = "ecrr_submitted"
graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
@@ -676,12 +676,12 @@ def edi_missingreport_graph(context):
get_dagster_logger().info(f"missing graph report returned {r} ")
return
@op(ins={"start": In(Nothing)})
-def edi_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="edi")
+def ecrr_submitted_graph_reports(context) :
+ source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="ecrr_submitted")
#source_url = source.get('url')
s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
bucket = GLEANER_MINIO_BUCKET
- source_name = "edi"
+ source_name = "ecrr_submitted"
graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
@@ -696,11 +696,11 @@ def edi_graph_reports(context) :
return
@op(ins={"start": In(Nothing)})
-def edi_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="edi")
+def ecrr_submitted_identifier_stats(context):
+ source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="ecrr_submitted")
s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
bucket = GLEANER_MINIO_BUCKET
- source_name = "edi"
+ source_name = "ecrr_submitted"
returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
r = str('returned value:{}'.format(returned_value))
@@ -711,10 +711,10 @@ def edi_identifier_stats(context):
return
@op(ins={"start": In(Nothing)})
-def edi_bucket_urls(context):
+def ecrr_submitted_bucket_urls(context):
s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
bucket = GLEANER_MINIO_BUCKET
- source_name = "edi"
+ source_name = "ecrr_submitted"
res = s3Minio.listSummonedUrls(bucket, source_name)
r = str('returned value:{}'.format(res))
@@ -728,10 +728,10 @@ class S3ObjectInfo:
object_name=""
@op(ins={"start": In(Nothing)})
-def edi_summarize(context) :
+def ecrr_submitted_summarize(context) :
s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
bucket = GLEANER_MINIO_BUCKET
- source_name = "edi"
+ source_name = "ecrr_submitted"
endpoint = _graphEndpoint() # getting data, not uploading data
summary_namespace = _graphSummaryEndpoint()
@@ -762,20 +762,20 @@ def edi_summarize(context) :
return
@op(ins={"start": In(Nothing)})
-def edi_upload_summarize(context):
- returned_value = post_to_graph("edi",path=SUMMARY_PATH, extension="ttl", graphendpoint=_graphSummaryEndpoint())
+def ecrr_submitted_upload_summarize(context):
+ returned_value = post_to_graph("ecrr_submitted",path=SUMMARY_PATH, extension="ttl", graphendpoint=_graphSummaryEndpoint())
r = str('returned value:{}'.format(returned_value))
get_dagster_logger().info(f"upload summary returned {r} ")
return
#Can we simplify and use just a method. Then import these methods?
-# def missingreport_s3(context, msg: str, source="edi"):
+# def missingreport_s3(context, msg: str, source="ecrr_submitted"):
#
# source= getSitemapSourcesFromGleaner("/scheduler/gleanerconfig.yaml", sourcename=source)
# source_url = source.get('url')
# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
# bucket = GLEANER_MINIO_BUCKET
-# source_name="edi"
+# source_name="ecrr_submitted"
#
# graphendpoint = None
# milled = False
@@ -784,32 +784,14 @@ def edi_upload_summarize(context):
# r = str('returned value:{}'.format(returned_value))
# return msg + r
@graph
-def harvest_edi():
- containers = edi_getImage()
- harvest = edi_gleaner(start=containers)
+def reload_ecrr_submitted():
+ containers = ecrr_submitted_getImage()
+ load_release = ecrr_submitted_naburelease(start=containers)
+ load_uploadrelease = ecrr_submitted_uploadrelease(start=load_release)
+ #report_graph = ecrr_submitted_graph_reports(start=load_uploadrelease)
+ # harvest = ecrr_submitted_gleaner(start=containers)
-# defingin nothing dependencies
- # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
-
- report_ms3 = edi_missingreport_s3(start=harvest)
- report_idstat = edi_identifier_stats(start=report_ms3)
- # for some reason, this causes a msg parameter missing
- report_bucketurl = edi_bucket_urls(start=report_idstat)
-
- #report1 = missingreport_s3(harvest, source="edi")
- load_release = edi_naburelease(start=harvest)
- load_uploadrelease = edi_uploadrelease(start=load_release)
-
- load_prune = edi_nabu_prune(start=load_uploadrelease)
- load_prov = edi_nabuprov(start=load_prune)
- load_org = edi_nabuorg(start=load_prov)
-
- summarize = edi_summarize(start=load_uploadrelease)
- upload_summarize = edi_upload_summarize(start=summarize)
-
-# run after load
- report_msgraph = edi_missingreport_graph(start=summarize)
- report_graph = edi_graph_reports(start=report_msgraph)
+#
diff --git a/dagster/implnets/workflows/ecrr/ecrr/sch/implnet_sch_ecrr_examples.py b/dagster/implnets/workflows/ecrr/ecrr/sch/implnet_sch_ecrr_examples.py
new file mode 100644
index 00000000..8f56fc9e
--- /dev/null
+++ b/dagster/implnets/workflows/ecrr/ecrr/sch/implnet_sch_ecrr_examples.py
@@ -0,0 +1,8 @@
+from dagster import schedule
+
+from ..jobs.implnet_jobs_ecrr_examples import job_ecrr_examples
+
+@schedule(cron_schedule="0 16 5 * *", job=job_ecrr_examples, execution_timezone="US/Central")
+def implnet_sch_ecrr_examples(_context):
+ run_config = {}
+ return run_config
diff --git a/dagster/implnets/workflows/ecrr/ecrr/sch/implnet_sch_ecrr_submitted.py b/dagster/implnets/workflows/ecrr/ecrr/sch/implnet_sch_ecrr_submitted.py
new file mode 100644
index 00000000..5b2ed643
--- /dev/null
+++ b/dagster/implnets/workflows/ecrr/ecrr/sch/implnet_sch_ecrr_submitted.py
@@ -0,0 +1,8 @@
+from dagster import schedule
+
+from ..jobs.implnet_jobs_ecrr_submitted import job_ecrr_submitted
+
+@schedule(cron_schedule="0 8 3 * *", job=job_ecrr_submitted, execution_timezone="US/Central")
+def implnet_sch_ecrr_submitted(_context):
+ run_config = {}
+ return run_config
diff --git a/dagster/implnets/generatedCode/implnet-eco/output/workspace.yaml b/dagster/implnets/workflows/ecrr/ecrr/workspace.yaml
similarity index 100%
rename from dagster/implnets/generatedCode/implnet-eco/output/workspace.yaml
rename to dagster/implnets/workflows/ecrr/ecrr/workspace.yaml
diff --git a/dagster/implnets/workflows/ecrr/jobs/implnet_jobs_ecrr_examples.py b/dagster/implnets/workflows/ecrr/jobs/implnet_jobs_ecrr_examples.py
deleted file mode 100644
index cb51bb8d..00000000
--- a/dagster/implnets/workflows/ecrr/jobs/implnet_jobs_ecrr_examples.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_ecrr_examples import harvest_ecrr_examples
-
-@job
-def implnet_job_ecrr_examples():
- harvest_ecrr_examples()
\ No newline at end of file
diff --git a/dagster/implnets/workflows/ecrr/jobs/implnet_jobs_ecrr_submitted.py b/dagster/implnets/workflows/ecrr/jobs/implnet_jobs_ecrr_submitted.py
deleted file mode 100644
index 715b4592..00000000
--- a/dagster/implnets/workflows/ecrr/jobs/implnet_jobs_ecrr_submitted.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from dagster import job
-
-from ops.implnet_ops_ecrr_submitted import harvest_ecrr_submitted
-
-@job
-def implnet_job_ecrr_submitted():
- harvest_ecrr_submitted()
\ No newline at end of file
diff --git a/dagster/implnets/workflows/ecrr/ops/implnet_ops_ecrr_examples.py b/dagster/implnets/workflows/ecrr/ops/implnet_ops_ecrr_examples.py
deleted file mode 100644
index 89a9f681..00000000
--- a/dagster/implnets/workflows/ecrr/ops/implnet_ops_ecrr_examples.py
+++ /dev/null
@@ -1,673 +0,0 @@
-import distutils
-
-from dagster import job, op, graph, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-from dagster import job, op, get_dagster_logger
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and netowrk need to be the names in docker, and not the names of the object in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANER_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANER_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = os.environ.get('GLEANER_MINIO_ADDRESS')
-GLEANER_MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT')
-GLEANER_MINIO_USE_SSL = os.environ.get('GLEANER_MINIO_USE_SSL')
-GLEANER_MINIO_SECRET_KEY = os.environ.get('GLEANER_MINIO_SECRET_KEY')
-GLEANER_MINIO_ACCESS_KEY = os.environ.get('GLEANER_MINIO_ACCESS_KEY')
-GLEANER_MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET')
-GLEANER_HEADLESS_ENDPOINT = os.environ.get('GLEANER_HEADLESS_ENDPOINT', "http://headless:9222")
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = os.environ.get('GLEANER_GRAPH_URL')
-GLEANER_GRAPH_NAMESPACE = os.environ.get('GLEANER_GRAPH_NAMESPACE')
-GLEANERIO_GLEANER_CONFIG_PATH= os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")
-GLEANERIO_NABU_CONFIG_PATH= os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")
-
-def _graphEndpoint():
- url = f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(os.environ.get('GLEANER_MINIO_ADDRESS')) + ":" + os.environ.get('GLEANER_MINIO_PORT')
- get_dagster_logger().info(f"S3 URL : {str(os.environ.get('GLEANER_MINIO_ADDRESS'))}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {str(os.environ.get('GLEANER_MINIO_PORT'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {str(GLEANER_MINIO_BUCKET)}")
- get_dagster_logger().info(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_USE_SSL'))),
- access_key=os.environ.get('GLEANER_MINIO_ACCESS_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET_KEY'),
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_USE_SSL')))
- if (os.environ.get('GLEANER_MINIO_PORT') and os.environ.get('GLEANER_MINIO_PORT') == 80
- and secure == False):
- server = _pythonMinioUrl(os.environ.get('GLEANER_MINIO_ADDRESS'))
- elif (os.environ.get('GLEANER_MINIO_PORT') and os.environ.get('GLEANER_MINIO_PORT') == 443
- and secure == True):
- server = _pythonMinioUrl(os.environ.get('GLEANER_MINIO_ADDRESS'))
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(os.environ.get('GLEANER_MINIO_ADDRESS'))}:{os.environ.get('GLEANER_MINIO_PORT')}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=os.environ.get('GLEANER_MINIO_ACCESS_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET_KEY'),
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
- # revision of EC utilities, will have a insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if os.environ.get('GLEANER_MINIO_USE_SSL'):
- proto = "https"
- port = os.environ.get('GLEANER_MINIO_PORT')
- address = os.environ.get('GLEANER_MINIO_ADDRESS')
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"creat docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _get_container_name(run_id, op_name, retry_number):
- container_name = hash_str(run_id + op_name)
-
- retry_number = retry_number
- if retry_number > 0:
- container_name = f"{container_name}-{retry_number}"
-
- return container_name
-
-
-def _create_container(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name=""
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"creat docker container")
- return client.containers.create(
- image,
- name=name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- detach=True,
- network=container_context.networks[0] if len(container_context.networks) else None,
- # entrypoint=entrypoint,
- command=command,
- environment=env_vars,
- **container_context.container_kwargs,
- )
-
-def gleanerio(context, mode, source):
- ## ------------ Create
-
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH')
- # CMD = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- CMD = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"gleaner01_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "nabu"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"nabu01_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"nabu01_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"nabu01_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"nabu01_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
- return 1
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = CMD
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- "volumes": {
- f"{GLEANER_CONFIG_VOLUME}":
- {'bind': '/configs', 'mode': 'rw'}
- },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_container: ")
- container = _create_container(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME
- )
- except docker.errors.ImageNotFound:
- client.images.pull(IMAGE)
- container = _create_container(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME
- )
-
- if len(container_context.networks) > 1:
- for network_name in container_context.networks[1:]:
- network = client.networks.get(network_name)
- network.connect(container)
-
- cid = container.id # legacy til the start get's fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
- DATA = s3reader(ARCHIVE_FILE)
- container.put_archive(ARCHIVE_PATH,DATA )
-
-
- ## ------------ Start
- ## note new issue:
- # {"message": "starting container with non-empty request body was deprecated since API v1.22 and removed in v1.24"}
- EMPTY_DATA="{}".encode('utf-8')
- url = URL + 'containers/' + cid + '/start'
- get_dagster_logger().info(f"Container start url: {url}")
- req = request.Request(url,data=EMPTY_DATA, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- try:
- r = request.urlopen(req)
- except HTTPError as err:
- get_dagster_logger().fatal(f"Container Start failed: {str(err.code)} reason: {err.reason}")
- raise err
- except Exception as err:
- print("failed to start container: unknown reason: ", err)
- get_dagster_logger().info(f"Create Failed: unknown reason {str(err)}")
- raise err
- print(r.status)
- get_dagster_logger().info(f"Start container: {str(r.status)}")
-
- # container.start()
- # client.api.start(container=container.id)
- ## start is not working
-
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
-
- # ## ------------ Wait expect 200
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=True).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: {str(r.status)}")
-
-## get log files
- url = URL + 'containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
- # Future, need to extraxct files, and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{str(mode)}_runlogs")
-
- #
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- if (cid):
- url = URL + 'containers/' + cid
- req = request.Request(url, method="DELETE")
- req.add_header('X-API-Key', APIKEY)
- # req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- else:
- get_dagster_logger().info(f"Container Not created, so not removed.")
- else:
- get_dagster_logger().info(f"Container NOT Remove: DEBUG ENABLED")
-
-
- return 0
-
-@op
-def ecrr_examples_gleaner(context)-> str:
- returned_value = gleanerio(context, ("gleaner"), "ecrr_examples")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner notes are {r} ")
- return r
-
-@op
-def ecrr_examples_nabu_prune(context, msg: str)-> str:
- returned_value = gleanerio(context,("nabu"), "ecrr_examples")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def ecrr_examples_nabuprov(context, msg: str)-> str:
- returned_value = gleanerio(context,("prov"), "ecrr_examples")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def ecrr_examples_nabuorg(context, msg: str)-> str:
- returned_value = gleanerio(context,("orgs"), "ecrr_examples")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def ecrr_examples_naburelease(context, msg: str) -> str:
- returned_value = gleanerio(context,("release"), "ecrr_examples")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-@op
-def ecrr_examples_uploadrelease(context, msg: str) -> str:
- returned_value = postRelease("ecrr_examples")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-
-@op
-def ecrr_examples_missingreport_s3(context, msg: str) -> str:
- source = getSitemapSourcesFromGleaner("/scheduler/gleanerconfig.yaml", sourcename="ecrr_examples")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "ecrr_examples"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing repoort returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- return msg + r
-@op
-def ecrr_examples_missingreport_graph(context, msg: str) -> str:
- source = getSitemapSourcesFromGleaner("/scheduler/gleanerconfig.yaml", sourcename="ecrr_examples")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "ecrr_examples"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
-
- return msg + r
-@op
-def ecrr_examples_graph_reports(context, msg: str) -> str:
- source = getSitemapSourcesFromGleaner("/scheduler/gleanerconfig.yaml", sourcename="ecrr_examples")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "ecrr_examples"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
-
- return msg + r
-
-@op
-def ecrr_examples_identifier_stats(context, msg: str) -> str:
- source = getSitemapSourcesFromGleaner("/scheduler/gleanerconfig.yaml", sourcename="ecrr_examples")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "ecrr_examples"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- return msg + r
-
-@op()
-def ecrr_examples_bucket_urls(context, msg: str) -> str:
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "ecrr_examples"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- return msg + r
-
-
-#Can we simplify and use just a method. Then import these methods?
-# def missingreport_s3(context, msg: str, source="ecrr_examples"):
-#
-# source= getSitemapSourcesFromGleaner("/scheduler/gleanerconfig.yaml", sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="ecrr_examples"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_ecrr_examples():
- pass
-# # harvest = ecrr_examples_gleaner()
-#
-# # report_ms3 = ecrr_examples_missingreport_s3(harvest)
-# report_idstat = ecrr_examples_identifier_stats(report_ms3)
-# # for some reason, this causes a msg parameter missing
-# report_bucketurl = ecrr_examples_bucket_urls(report_idstat)
-#
-# #report1 = missingreport_s3(harvest, source="ecrr_examples")
-# load_release = ecrr_examples_naburelease(harvest)
-# load_uploadrelease = ecrr_examples_uploadrelease(load_release)
-#
-# load_prune = ecrr_examples_nabu_prune(load_uploadrelease)
-# load_prov = ecrr_examples_nabuprov(load_prune)
-# load_org = ecrr_examples_nabuorg(load_prov)
-#
-# # run after load
-# report_msgraph=ecrr_examples_missingreport_graph(load_org)
-# report_graph=ecrr_examples_graph_reports(report_msgraph)
-
-
-
-
diff --git a/dagster/implnets/workflows/ecrr/ops/implnet_ops_ecrr_submitted.py b/dagster/implnets/workflows/ecrr/ops/implnet_ops_ecrr_submitted.py
deleted file mode 100644
index b286817b..00000000
--- a/dagster/implnets/workflows/ecrr/ops/implnet_ops_ecrr_submitted.py
+++ /dev/null
@@ -1,672 +0,0 @@
-import distutils
-
-from dagster import job, op, graph, get_dagster_logger
-import os, json, io
-import urllib
-from urllib import request
-from urllib.error import HTTPError
-from dagster import job, op, get_dagster_logger
-from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner
-import json
-
-from minio import Minio
-from minio.error import S3Error
-from datetime import datetime
-from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo
-from ec.datastore import s3
-import requests
-import logging as log
-from urllib.error import HTTPError
-
-from typing import Any, Mapping, Optional, Sequence
-
-import docker
-from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op
-from dagster._annotations import experimental
-from dagster._core.utils import parse_env_var
-from dagster._serdes.utils import hash_str
-
-from dagster_docker.container_context import DockerContainerContext
-from dagster_docker.docker_run_launcher import DockerRunLauncher
-from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
-
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true')
-# volume and netowrk need to be the names in docker, and not the names of the object in docker compose
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANER_CONFIG_VOLUME', "dagster_gleaner_configs")
-# Vars and Envs
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANER_HEADLESS_NETWORK', "headless_gleanerio")
-# env items
-URL = os.environ.get('PORTAINER_URL')
-APIKEY = os.environ.get('PORTAINER_KEY')
-
-
-GLEANER_MINIO_ADDRESS = os.environ.get('GLEANER_MINIO_ADDRESS')
-GLEANER_MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT')
-GLEANER_MINIO_USE_SSL = os.environ.get('GLEANER_MINIO_USE_SSL')
-GLEANER_MINIO_SECRET_KEY = os.environ.get('GLEANER_MINIO_SECRET_KEY')
-GLEANER_MINIO_ACCESS_KEY = os.environ.get('GLEANER_MINIO_ACCESS_KEY')
-GLEANER_MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET')
-GLEANER_HEADLESS_ENDPOINT = os.environ.get('GLEANER_HEADLESS_ENDPOINT', "http://headless:9222")
-# using GLEANER, even though this is a nabu property... same prefix seems easier
-GLEANER_GRAPH_URL = os.environ.get('GLEANER_GRAPH_URL')
-GLEANER_GRAPH_NAMESPACE = os.environ.get('GLEANER_GRAPH_NAMESPACE')
-GLEANERIO_GLEANER_CONFIG_PATH= os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")
-GLEANERIO_NABU_CONFIG_PATH= os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")
-
-def _graphEndpoint():
- url = f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
- return url
-
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-def read_file_bytestream(image_path):
- data = open(image_path, 'rb').read()
- return data
-
-
-def load_data(file_or_url):
- try:
- with urllib.request.urlopen(file_or_url) as f:
- data = f.read()
- except ValueError:
- with open(file_or_url, 'rb') as f:
- data = f.read()
- return data
-
-
-def s3reader(object):
- server = _pythonMinioUrl(os.environ.get('GLEANER_MINIO_ADDRESS')) + ":" + os.environ.get('GLEANER_MINIO_PORT')
- get_dagster_logger().info(f"S3 URL : {str(os.environ.get('GLEANER_MINIO_ADDRESS'))}")
- get_dagster_logger().info(f"S3 PYTHON SERVER : {server}")
- get_dagster_logger().info(f"S3 PORT : {str(os.environ.get('GLEANER_MINIO_PORT'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}")
- # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}")
- get_dagster_logger().info(f"S3 BUCKET : {str(GLEANER_MINIO_BUCKET)}")
- get_dagster_logger().info(f"S3 object : {str(object)}")
-
- client = Minio(
- server,
- # secure=True,
- secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_USE_SSL'))),
- access_key=os.environ.get('GLEANER_MINIO_ACCESS_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET_KEY'),
- )
- try:
- data = client.get_object(GLEANER_MINIO_BUCKET, object)
- return data
- except S3Error as err:
- get_dagster_logger().info(f"S3 read error : {str(err)}")
-
-
-def s3loader(data, name):
- secure= bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_USE_SSL')))
- if (os.environ.get('GLEANER_MINIO_PORT') and os.environ.get('GLEANER_MINIO_PORT') == 80
- and secure == False):
- server = _pythonMinioUrl(os.environ.get('GLEANER_MINIO_ADDRESS'))
- elif (os.environ.get('GLEANER_MINIO_PORT') and os.environ.get('GLEANER_MINIO_PORT') == 443
- and secure == True):
- server = _pythonMinioUrl(os.environ.get('GLEANER_MINIO_ADDRESS'))
- else:
- # it's not on a normal port
- server = f"{_pythonMinioUrl(os.environ.get('GLEANER_MINIO_ADDRESS'))}:{os.environ.get('GLEANER_MINIO_PORT')}"
-
- client = Minio(
- server,
- secure=secure,
- #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))),
- access_key=os.environ.get('GLEANER_MINIO_ACCESS_KEY'),
- secret_key=os.environ.get('GLEANER_MINIO_SECRET_KEY'),
- )
-
- # Make 'X' bucket if not exist.
- # found = client.bucket_exists("X")
- # if not found:
- # client.make_bucket("X")
- # else:
- # print("Bucket 'X' already exists")
-
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
-
- logname = name + '_{}.log'.format(date_string)
- objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname
- f = io.BytesIO()
- #length = f.write(bytes(json_str, 'utf-8'))
- length = f.write(data)
- f.seek(0)
- client.put_object(GLEANER_MINIO_BUCKET,
- objPrefix,
- f, #io.BytesIO(data),
- length, #len(data),
- content_type="text/plain"
- )
- get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
-def postRelease(source):
- # revision of EC utilities, will have a insertFromURL
- #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
- proto = "http"
-
- if os.environ.get('GLEANER_MINIO_USE_SSL'):
- proto = "https"
- port = os.environ.get('GLEANER_MINIO_PORT')
- address = os.environ.get('GLEANER_MINIO_ADDRESS')
- bucket = GLEANER_MINIO_BUCKET
- path = "graphs/latest"
- release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq"
- url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
- get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
- r = requests.post(url)
- log.debug(f' status:{r.status_code}') # status:404
- get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
- if r.status_code == 200:
- # ''
- if 'data modified="0"' in r.text:
- get_dagster_logger().info(f'graph: no data inserted ')
- raise Exception("No Data Added: " + r.text)
- return True
- else:
- get_dagster_logger().info(f'graph: error')
- raise Exception(f' graph: insert failed: status:{r.status_code}')
-
-def _get_client(docker_container_context: DockerContainerContext):
- headers = {'X-API-Key': APIKEY}
- client = docker.DockerClient(base_url=URL, version="1.43" )
- #client = docker.APIClient(base_url=URL, version="1.35")
- get_dagster_logger().info(f"creat docker client")
- if (client.api._general_configs):
- client.api._general_configs["HttpHeaders"] = headers
- else:
- client.api._general_configs={"HttpHeaders":headers}
- client.api.headers['X-API-Key'] = APIKEY
- get_dagster_logger().info(f" docker version {client.version()}")
- if docker_container_context.registry:
- client.login(
- registry=docker_container_context.registry["url"],
- username=docker_container_context.registry["username"],
- password=docker_container_context.registry["password"],
- )
- return client
-
-
-def _get_container_name(run_id, op_name, retry_number):
- container_name = hash_str(run_id + op_name)
-
- retry_number = retry_number
- if retry_number > 0:
- container_name = f"{container_name}-{retry_number}"
-
- return container_name
-
-
-def _create_container(
- op_context: OpExecutionContext,
- client,
- container_context: DockerContainerContext,
- image: str,
- entrypoint: Optional[Sequence[str]],
- command: Optional[Sequence[str]],
- name=""
-):
- env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
- get_dagster_logger().info(f"creat docker container")
- return client.containers.create(
- image,
- name=name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
- detach=True,
- network=container_context.networks[0] if len(container_context.networks) else None,
- # entrypoint=entrypoint,
- command=command,
- environment=env_vars,
- **container_context.container_kwargs,
- )
-
-def gleanerio(context, mode, source):
- ## ------------ Create
-
- get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
- if str(mode) == "gleaner":
- IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH')
- # CMD = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
- CMD = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
- NAME = f"gleaner01_{source}_{str(mode)}"
- WorkingDir = "/gleaner/"
- #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
- # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
- elif (str(mode) == "nabu"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
- NAME = f"nabu01_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "prov"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
- NAME = f"nabu01_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "orgs"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
- NAME = f"nabu01_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- elif (str(mode) == "release"):
- IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE')
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT')
- ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH')
- CMD = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
- NAME = f"nabu01_{source}_{str(mode)}"
- WorkingDir = "/nabu/"
- Entrypoint = "nabu"
- # LOGFILE = 'log_nabu.txt' # only used for local log file writing
- else:
- return 1
-
- # from docker0dagster
- run_container_context = DockerContainerContext.create_for_run(
- context.dagster_run,
- context.instance.run_launcher
- if isinstance(context.instance.run_launcher, DockerRunLauncher)
- else None,
- )
- validate_docker_image(IMAGE)
-
- try:
- # setup data/body for container create
- data = {}
- data["Image"] = IMAGE
- data["WorkingDir"] = WorkingDir
- #data["Entrypoint"] = Entrypoint
- data["Cmd"] = CMD
-#### gleaner
- # v.BindEnv("minio.address", "MINIO_ADDRESS")
- # v.BindEnv("minio.port", "MINIO_PORT")
- # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
- # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
- # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
- # v.BindEnv("minio.bucket", "MINIO_BUCKET")
- # // v.BindEnv("minio.region", "MINIO_REGION")
- # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
- # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
- # v.BindEnv("sparql.username", "SPARQL_USERNAME")
- # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
- # v.BindEnv("s3.domain", "S3_DOMAIN")
-### gleaner summoner config
- # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
- # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
- # viperSubtree.BindEnv("mode", "GLEANER_MODE")
-
- #### NABU config
- # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
- # minioSubtress.BindEnv("port", "MINIO_PORT")
- # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
- # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
- # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
- ###### nabu sparql config
- # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
- # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
- # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
- # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
- # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
- # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
- ### NABU object
- # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
- # viperSubtree.BindEnv("domain", "S3_DOMAIN")
- # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
-
- # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
- enva = []
- enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS)))
- enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT)))
- enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL)))
- enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY)))
- enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY)))
- enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET)))
- enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint())))
- enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT)))
- enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK)))
-
- data["Env"] = enva
- data["HostConfig"] = {
- "NetworkMode": GLEANER_HEADLESS_NETWORK,
- "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
- }
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
-
-# docker dagster
- get_dagster_logger().info(f"start docker code region: ")
- # trying to get headers in:
- # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
- op_container_context = DockerContainerContext(
- # registry=registry,
- env_vars=enva,
- networks=[GLEANER_HEADLESS_NETWORK],
- container_kwargs={"working_dir": data["WorkingDir"],
- "volumes": {
- f"{GLEANER_CONFIG_VOLUME}":
- {'bind': '/configs', 'mode': 'rw'}
- },
-
-
- },
- )
- container_context = run_container_context.merge(op_container_context)
- get_dagster_logger().info(f"call docker _get_client: ")
- client = _get_client(container_context)
-
- try:
- get_dagster_logger().info(f"try docker _create_container: ")
- container = _create_container(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME
- )
- except docker.errors.ImageNotFound:
- client.images.pull(IMAGE)
- container = _create_container(
- context, client, container_context, IMAGE, "", data["Cmd"], name=NAME
- )
-
- if len(container_context.networks) > 1:
- for network_name in container_context.networks[1:]:
- network = client.networks.get(network_name)
- network.connect(container)
-
- cid = container.id # legacy til the start get's fixed
-
-
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
- DATA = s3reader(ARCHIVE_FILE)
- container.put_archive(ARCHIVE_PATH,DATA )
-
-
- ## ------------ Start
- ## note new issue:
- # {"message": "starting container with non-empty request body was deprecated since API v1.22 and removed in v1.24"}
- EMPTY_DATA="{}".encode('utf-8')
- url = URL + 'containers/' + cid + '/start'
- get_dagster_logger().info(f"Container start url: {url}")
- req = request.Request(url,data=EMPTY_DATA, method="POST")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- try:
- r = request.urlopen(req)
- except HTTPError as err:
- get_dagster_logger().fatal(f"Container Start failed: {str(err.code)} reason: {err.reason}")
- raise err
- except Exception as err:
- print("failed to start container: unknown reason: ", err)
- get_dagster_logger().info(f"Create Failed: unknown reason {str(err)}")
- raise err
- print(r.status)
- get_dagster_logger().info(f"Start container: {str(r.status)}")
-
- # container.start()
- # client.api.start(container=container.id)
- ## start is not working
-
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
-
- # ## ------------ Wait expect 200
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=True).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: {str(r.status)}")
-
-## get log files
- url = URL + 'containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
- # Future, need to extraxct files, and upload
- # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
- # pw_tar.extractall("extract_to/")
-
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{str(mode)}_runlogs")
-
- #
- if exit_status != 0:
- raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
- finally:
- if (not DEBUG) :
- if (cid):
- url = URL + 'containers/' + cid
- req = request.Request(url, method="DELETE")
- req.add_header('X-API-Key', APIKEY)
- # req.add_header('content-type', 'application/json')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
- print(r.status)
- get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- else:
- get_dagster_logger().info(f"Container Not created, so not removed.")
- else:
- get_dagster_logger().info(f"Container NOT Remove: DEBUG ENABLED")
-
-
- return 0
-
-@op
-def ecrr_submitted_gleaner(context)-> str:
- returned_value = gleanerio(context, ("gleaner"), "ecrr_submitted")
- r = str('returned value:{}'.format(returned_value))
- get_dagster_logger().info(f"Gleaner notes are {r} ")
- return r
-
-@op
-def ecrr_submitted_nabu_prune(context, msg: str)-> str:
- returned_value = gleanerio(context,("nabu"), "ecrr_submitted")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def ecrr_submitted_nabuprov(context, msg: str)-> str:
- returned_value = gleanerio(context,("prov"), "ecrr_submitted")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def ecrr_submitted_nabuorg(context, msg: str)-> str:
- returned_value = gleanerio(context,("orgs"), "ecrr_submitted")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-@op
-def ecrr_submitted_naburelease(context) -> str:
- returned_value = gleanerio(context,("release"), "ecrr_submitted")
- r = str('returned value:{}'.format(returned_value))
- return r
-@op
-def ecrr_submitted_uploadrelease(context, msg: str) -> str:
- returned_value = postRelease("ecrr_submitted")
- r = str('returned value:{}'.format(returned_value))
- return msg + r
-
-
-@op
-def ecrr_submitted_missingreport_s3(context, msg: str) -> str:
- source = getSitemapSourcesFromGleaner("/scheduler/gleanerconfig.yaml", sourcename="ecrr_submitted")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "ecrr_submitted"
- graphendpoint = None
- milled = False
- summon = True
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing repoort returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
- s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report)
- return msg + r
-@op
-def ecrr_submitted_missingreport_graph(context, msg: str) -> str:
- source = getSitemapSourcesFromGleaner("/scheduler/gleanerconfig.yaml", sourcename="ecrr_submitted")
- source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "ecrr_submitted"
-
- graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = True
- summon = False # summon only off
- returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
- r = str('missing report graph returned value:{}'.format(returned_value))
- report = json.dumps(returned_value, indent=2)
-
- s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report)
-
- return msg + r
-@op
-def ecrr_submitted_graph_reports(context, msg: str) -> str:
- source = getSitemapSourcesFromGleaner("/scheduler/gleanerconfig.yaml", sourcename="ecrr_submitted")
- #source_url = source.get('url')
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "ecrr_submitted"
-
- graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
-
- milled = False
- summon = True
- returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
- r = str('returned value:{}'.format(returned_value))
- #report = json.dumps(returned_value, indent=2) # value already json.dumps
- report = returned_value
- s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
-
- return msg + r
-
-@op
-def ecrr_submitted_identifier_stats(context, msg: str) -> str:
- source = getSitemapSourcesFromGleaner("/scheduler/gleanerconfig.yaml", sourcename="ecrr_submitted")
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "ecrr_submitted"
-
- returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
- r = str('returned value:{}'.format(returned_value))
- #r = str('identifier stats returned value:{}'.format(returned_value))
- report = returned_value.to_json()
- s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
- return msg + r
-
-@op()
-def ecrr_submitted_bucket_urls(context, msg: str) -> str:
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
- bucket = GLEANER_MINIO_BUCKET
- source_name = "ecrr_submitted"
-
- res = s3Minio.listSummonedUrls(bucket, source_name)
- r = str('returned value:{}'.format(res))
- bucketurls = json.dumps(res, indent=2)
- s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls)
- return msg + r
-
-
-#Can we simplify and use just a method. Then import these methods?
-# def missingreport_s3(context, msg: str, source="ecrr_submitted"):
-#
-# source= getSitemapSourcesFromGleaner("/scheduler/gleanerconfig.yaml", sourcename=source)
-# source_url = source.get('url')
-# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
-# bucket = GLEANER_MINIO_BUCKET
-# source_name="ecrr_submitted"
-#
-# graphendpoint = None
-# milled = False
-# summon = True
-# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
-# r = str('returned value:{}'.format(returned_value))
-# return msg + r
-@graph
-def harvest_ecrr_submitted():
-# harvest = ecrr_submitted_gleaner()
-
-# report_ms3 = ecrr_submitted_missingreport_s3(harvest)
-# report_idstat = ecrr_submitted_identifier_stats(report_ms3)
- # for some reason, this causes a msg parameter missing
-# report_bucketurl = ecrr_submitted_bucket_urls(report_idstat)
-
- #report1 = missingreport_s3(harvest, source="ecrr_submitted")
- load_release = ecrr_submitted_naburelease()
- load_uploadrelease = ecrr_submitted_uploadrelease(load_release)
-
- load_prune = ecrr_submitted_nabu_prune(load_uploadrelease)
- load_prov = ecrr_submitted_nabuprov(load_prune)
- load_org = ecrr_submitted_nabuorg(load_prov)
-
-# run after load
-# report_msgraph=ecrr_submitted_missingreport_graph(load_org)
- report_graph=ecrr_submitted_graph_reports(load_release)
-
-
-
-
diff --git a/dagster/implnets/workflows/ecrr/pyproject.toml b/dagster/implnets/workflows/ecrr/pyproject.toml
new file mode 100644
index 00000000..f0d202f2
--- /dev/null
+++ b/dagster/implnets/workflows/ecrr/pyproject.toml
@@ -0,0 +1,6 @@
+[build-system]
+requires = ["setuptools"]
+build-backend = "setuptools.build_meta"
+
+[tool.dagster]
+module_name = "ecrr"
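With this in place, `dagster dev` run from the workflow directory should pick up the `ecrr` module as the code location, via Dagster's `[tool.dagster]` support in pyproject.toml.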
diff --git a/dagster/implnets/workflows/ecrr/repositories/repository.py b/dagster/implnets/workflows/ecrr/repositories/repository.py
deleted file mode 100644
index 263e13aa..00000000
--- a/dagster/implnets/workflows/ecrr/repositories/repository.py
+++ /dev/null
@@ -1,13 +0,0 @@
-from dagster import repository
-from jobs.implnet_jobs_ecrr_submitted import implnet_job_ecrr_submitted
-from sch.implnet_sch_ecrr_submitted import implnet_sch_ecrr_submitted
-from jobs.implnet_jobs_ecrr_examples import implnet_job_ecrr_examples
-from sch.implnet_sch_ecrr_examples import implnet_sch_ecrr_examples
-
-@repository(name="ecrr")
-def ecrr():
- jobs = [implnet_job_ecrr_submitted, implnet_job_ecrr_examples]
- schedules = [implnet_sch_ecrr_submitted, implnet_sch_ecrr_examples]
-
-
- return jobs + schedules
diff --git a/dagster/implnets/workflows/ecrr/sch/implnet_sch_ecrr_examples.py b/dagster/implnets/workflows/ecrr/sch/implnet_sch_ecrr_examples.py
deleted file mode 100644
index d7bba855..00000000
--- a/dagster/implnets/workflows/ecrr/sch/implnet_sch_ecrr_examples.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_ecrr_examples import implnet_job_ecrr_examples
-
-@schedule(cron_schedule="0 12 * * 3", job=implnet_job_ecrr_examples, execution_timezone="US/Central")
-def implnet_sch_ecrr_examples(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/workflows/ecrr/sch/implnet_sch_ecrr_submitted.py b/dagster/implnets/workflows/ecrr/sch/implnet_sch_ecrr_submitted.py
deleted file mode 100644
index e6c739d9..00000000
--- a/dagster/implnets/workflows/ecrr/sch/implnet_sch_ecrr_submitted.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import schedule
-
-from jobs.implnet_jobs_ecrr_submitted import implnet_job_ecrr_submitted
-
-@schedule(cron_schedule="0 0 * * 0", job=implnet_job_ecrr_submitted, execution_timezone="US/Central")
-def implnet_sch_ecrr_submitted(_context):
- run_config = {}
- return run_config
diff --git a/dagster/implnets/workflows/ecrr/tests/__init__.py b/dagster/implnets/workflows/ecrr/tests/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/dagster/implnets/workflows/ingest/NOTES.md b/dagster/implnets/workflows/ingest/NOTES.md
new file mode 100644
index 00000000..3b9cd820
--- /dev/null
+++ b/dagster/implnets/workflows/ingest/NOTES.md
@@ -0,0 +1,26 @@
+
+# Schedules
+
+It's hard to create a set of dynamic schedules with varying crons
+https://github.com/dagster-io/dagster/discussions/22121
+
+Right now, all sources will run weekly
+
+while not ideal, I think we could set up four schedules: daily, weekly, monthly, and quarterly.
+Then if the cron in the source matched, a run would occur.
+
+more complex would be having something that runs (hourly),
+goes through the list of sources and their last runs, and if it is time to run, runs that source.
+Basically, put an evaluation function in before deciding whether a run should occur;
+if it should, add a run request to the list of run requests for that tick, then return that list (see the sketch below).
+
+
+How do I write a sensor or schedule that requests a run for every partition on every tick?
+https://github.com/dagster-io/dagster/discussions/15532
+
+partition metadata about last run: https://github.com/dagster-io/dagster/discussions/14338
+How to ensure the previous partition of a job has succeeded before running the next partition https://github.com/dagster-io/dagster/discussions/10264
+
+dynamic partitions
+https://docs.dagster.io/concepts/partitions-schedules-sensors/partitioning-assets#dynamically-partitioned-assets
+
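A minimal sketch of the evaluate-then-run idea above, assuming an hourly cadence, a hard-coded source list, and an `is_due()` helper (none of which exist in this repo yet); `summon_asset_job` is the job defined in the ingest module:

```python
# Sketch: one hourly schedule that decides per tick which sources are due.
from dagster import RunRequest, SkipReason, schedule

SOURCES = [  # assumed; in practice read from the sources file in s3
    {"name": "iris", "cron": "0 0 * * 0"},
    {"name": "opentopography", "cron": "0 12 * * 3"},
]

def is_due(source, tick_time) -> bool:
    # assumed helper: compare the source's cron and last run against the tick time
    return True

@schedule(cron_schedule="0 * * * *", job_name="summon_asset_job")
def hourly_source_schedule(context):
    due = [s for s in SOURCES if is_due(s, context.scheduled_execution_time)]
    if not due:
        return SkipReason("no sources due this tick")
    return [RunRequest(run_key=s["name"], partition_key=s["name"]) for s in due]
```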
diff --git a/dagster/implnets/workflows/ingest/README.md b/dagster/implnets/workflows/ingest/README.md
new file mode 100644
index 00000000..cea90599
--- /dev/null
+++ b/dagster/implnets/workflows/ingest/README.md
@@ -0,0 +1,53 @@
+# Ingest Rework
+
+This is an attempt to rework the ingest system: to split the summon/release-file step
+from the load-to-graph and clean-graph steps, and from the reporting.
+
+**the model is that**
+1. we read a list of sources. In the long term this will be a file in an s3 bucket with just the gleanerio source information
+2. for each source, we harvest (summon, create release, and, behind an optional flag, summarize and load the summary)... in the long term, this will need to create a dynamic schedule
+3. generate reports and stats
+4. read and create communities (from an s3 location?)
+   * all
+   * customized
+5. update community sensor
+   * when a source is updated, update the community
+
+## gleaner io container routines
+* summon : run gleaner, run nabu release
+ * assets -> summon path (metadata: s3:path file count, time), release file (metadata: s3path, size, time), reports
+* release
+* prune
+* prov
+* orgs
+
+## ops:
+ * Load to graph
+ * summarize
+ * load summarize
+ * reports
+ * graph (prune, prov, orgs)
+ * community stats
+ * UI
+## Sensor:
+These routines are useful to all communities.
+
+* new release file
+ * run prov
+ * run bucket report, missing report, identifier report
+ * run summarize. Not needed by all communities, but prevents a duplicate op from being run. Can add a flag.
+
+## Sensor for a community
+Have a sensor that looks at the release files, and then determines if a release needs to be pushed to a community,
+i.e. if this release is a source in my community.
+ * load graph
+ * graph report
+ * load prov
+ * load summarize
+ * (ec) run community stats (about)
+
+
+
+This is a [Dagster](https://dagster.io/) project made to be used alongside the official [Dagster tutorial](https://docs.dagster.io/tutorial).
+
+Use Dagster AWS for MinIO configuration.
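A minimal sketch of that, with assumed endpoint and credential values; `dagster-aws`'s `S3Resource` takes a MinIO endpoint via `endpoint_url`, and the real wiring lives in `ingest/__init__.py`:

```python
from dagster_aws.s3 import S3Resource

# Point the AWS S3 resource at a MinIO server (values are assumptions).
s3 = S3Resource(
    endpoint_url="https://oss.example.org:443",
    aws_access_key_id="minio-access-key",
    aws_secret_access_key="minio-secret-key",
)
```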
diff --git a/dagster/implnets/workflows/ingest/data/tenant.yaml b/dagster/implnets/workflows/ingest/data/tenant.yaml
new file mode 100644
index 00000000..60fe2095
--- /dev/null
+++ b/dagster/implnets/workflows/ingest/data/tenant.yaml
@@ -0,0 +1,29 @@
+# prototype tenants file
+
+tenant:
+ - community: dev
+ hostname: geocodes-dev
+ description: GeoCodes is...
+ name: Geocodes Science on Schema
+ url: https://www.earthcube.org
+ logo: https://unsplash.com/random
+ graph:
+ main_namespace: test
+ summary_namespace: test_summary
+ sources:
+ - iris
+ - opentopography
+######
+ - community: geocodesall
+ hostname: geocodes-all
+ description: GeoCodes is...
+ name: Geocodes Science on Schema
+ url: https://www.earthcube.org
+ logo: https://unsplash.com/random
+ graph:
+ main_namespace: geocodes_test
+ summary_namespace: geocodes_test_summary
+ sources:
+ - all
diff --git a/dagster/implnets/workflows/ingest/data/tenant_dev.yaml b/dagster/implnets/workflows/ingest/data/tenant_dev.yaml
new file mode 100644
index 00000000..dee0699c
--- /dev/null
+++ b/dagster/implnets/workflows/ingest/data/tenant_dev.yaml
@@ -0,0 +1,28 @@
+# prototype tenants file
+
+tenant:
+ - community: dev
+ hostname: geocodes-dev
+ description: GeoCodes is...
+ name: Geocodes Science on Schema
+ url: https://www.earthcube.org
+ logo: https://unsplash.com/random
+ graph:
+ main_namespace: test
+ summary_namespace: test_summary
+ sources:
+ - iris
+ - opentopography
+######
+ - community: geocodesall
+ hostname: geocodes-all
+ description: GeoCodes is...
+ name: Geocodes Science on Schema
+ url: https://www.earthcube.org
+ logo: https://unsplash.com/random
+ graph:
+ main_namespace: geocodes_test
+ summary_namespace: geocodes_test_summary
+ sources:
+ - all
+
diff --git a/dagster/implnets/workflows/ingest/data/tenant_prod.yaml b/dagster/implnets/workflows/ingest/data/tenant_prod.yaml
new file mode 100644
index 00000000..9661df09
--- /dev/null
+++ b/dagster/implnets/workflows/ingest/data/tenant_prod.yaml
@@ -0,0 +1,40 @@
+# prototype tenants file
+
+tenant:
+ - community: production
+ hostname: geocodes-aws
+ description: GeoCodes is...
+ name: Geocodes Science on Schema
+ url: https://www.earthcube.org
+ logo: https://unsplash.com/random
+ graph:
+ main_namespace: geocodes
+ summary_namespace: geocodes_summary
+ sources:
+ - bcodmo
+ - r2r
+######
+ - community: geocodesall
+ hostname: geocodes-all
+ description: GeoCodes is...
+ name: Geocodes Science on Schema
+ url: https://www.earthcube.org
+ logo: https://unsplash.com/random
+ graph:
+ main_namespace: earthcube
+ summary_namespace: earthcube_summary
+ sources:
+ - all
+######
+ - community: deepoceans
+ hostname: oceans
+ description: this is a test 1
+ name: A community description
+ url: https://www.earthcube.org
+ logo: https://unsplash.com/random
+ graph:
+ main_namespace: oceans
+ summary_namespace: oceans_summary
+ sources:
+ - bcodmo
+ - r2r
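As a quick illustration (the filename is an assumption), a tenant file like the one above parses with the same `yaml.safe_load` pattern used later in `gleaner_sources.py`:

```python
import yaml

# Load the tenant file and pull out the community names.
with open("tenant_prod.yaml") as f:
    doc = yaml.safe_load(f)
communities = [t["community"] for t in doc["tenant"]]
# -> ['production', 'geocodesall', 'deepoceans']
```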
diff --git a/dagster/implnets/workflows/ingest/data/tennant.yaml b/dagster/implnets/workflows/ingest/data/tennant.yaml
new file mode 100644
index 00000000..08c7d57b
--- /dev/null
+++ b/dagster/implnets/workflows/ingest/data/tennant.yaml
@@ -0,0 +1,29 @@
+# prototype tennants file
+
+# prototype tennants file
+
+tennant:
+ - community: dev
+ hostname: geocodes-dev
+ description: GeoCodes is...
+ name: Geocodes Science on Schema
+ url: https://www.earthcube.org
+ logo: https://unsplash.com/random
+ graph:
+ main_namespace: test
+ summary_namespace: test_summary
+ sources:
+ - iris
+ - opentopography
+######
+ - community: geocodesall
+ hostname: geocodes-all
+ description: GeoCodes is...
+ name: Geocodes Science on Schema
+ url: https://www.earthcube.org
+ logo: https://unsplash.com/random
+ graph:
+ main_namespace: geocodes_test
+ summary_namespace: geocodes_test_summary
+ sources:
+ - all
diff --git a/dagster/implnets/workflows/ingest/data/tennant_dev.yaml b/dagster/implnets/workflows/ingest/data/tennant_dev.yaml
new file mode 100644
index 00000000..5a83443c
--- /dev/null
+++ b/dagster/implnets/workflows/ingest/data/tennant_dev.yaml
@@ -0,0 +1,28 @@
+# prototype tennants file
+
+tennant:
+ - community: dev
+ hostname: geocodes-dev
+ description: GeoCodes is...
+ name: Geocodes Science on Schema
+ url: https://www.earthcube.org
+ logo: https://unsplash.com/random
+ graph:
+ main_namespace: test
+ summary_namespace: test_summary
+ sources:
+ - iris
+ - opentopography
+######
+ - community: geocodesall
+ hostname: geocodes-all
+ description: GeoCodes is...
+ name: Geocodes Science on Schema
+ url: https://www.earthcube.org
+ logo: https://unsplash.com/random
+ graph:
+ main_namespace: geocodes_test
+ summary_namespace: geocodes_test_summary
+ sources:
+ - all
+
diff --git a/dagster/implnets/workflows/ingest/data/tennant_prod.yaml b/dagster/implnets/workflows/ingest/data/tennant_prod.yaml
new file mode 100644
index 00000000..defc8bd7
--- /dev/null
+++ b/dagster/implnets/workflows/ingest/data/tennant_prod.yaml
@@ -0,0 +1,40 @@
+# prototype tennants file
+
+tennant:
+ - community: production
+ hostname: geocodes-aws
+ description: GeoCodes is...
+ name: Geocodes Science on Schema
+ url: https://www.earthcube.org
+ logo: https://unsplash.com/random
+ graph:
+ main_namespace: geocodes
+ summary_namespace: geocodes_summary
+ sources:
+ - bcodmo
+ - r2r
+######
+ - community: geocodesall
+ hostname: geocodes-all
+ description: GeoCodes is...
+ name: Geocodes Science on Schema
+ url: https://www.earthcube.org
+ logo: https://unsplash.com/random
+ graph:
+ main_namespace: earthcube
+ summary_namespace: earthcube_summary
+ sources:
+ - all
+######
+ - community: deepoceans
+ hostname: oceans
+ description: this is a test 1
+ name: A community description
+ url: https://www.earthcube.org
+ logo: https://unsplash.com/random
+ graph:
+ main_namespace: oceans
+ summary_namespace: oceans_summary
+ sources:
+ - bcodmo
+ - r2r
diff --git a/dagster/implnets/workflows/ingest/ingest/__init__.py b/dagster/implnets/workflows/ingest/ingest/__init__.py
new file mode 100644
index 00000000..0baa2355
--- /dev/null
+++ b/dagster/implnets/workflows/ingest/ingest/__init__.py
@@ -0,0 +1,240 @@
+########### NOTES ON THIS ####
+# the resources need to be correct for the code to run:
+# * fields need to be defined; they cannot be left bare, as in
+
+# BlazegraphResource(),
+
+# they need to have definitions, e.g.
+
+# BlazegraphResource(
+# GLEANERIO_GRAPH_URL=EnvVar('GLEANERIO_GRAPH_URL'),
+# GLEANERIO_GRAPH_NAMESPACE=EnvVar('GLEANERIO_GRAPH_NAMESPACE'),
+# )
+#### QUIRKS ###
+# if a type is changed in a configuration, you need to change all the configs, and not just one.
+# so when
+
+import os
+
+from dagster import (
+    AssetSelection,
+    Definitions,
+    EnvVar,
+    define_asset_job,
+    load_assets_from_modules,
+)
+from dagster_aws.s3.resources import S3Resource
+from dagster_aws.s3.ops import S3Coordinate
+from dagster_slack import SlackResource, make_slack_on_run_failure_sensor
+
+from .resources.graph import BlazegraphResource, GraphResource
+from .resources.gleanerio import GleanerioResource
+from .resources.gleanerS3 import gleanerS3Resource
+from .assets import (
+ gleanerio_run,
+ release_nabu_run
+)
+
+from .jobs import (
+    summon_asset_job,
+    sources_asset_job,
+    sources_partitions_def,
+    tenant_asset_job,
+    tenant_namespaces_job,
+    release_asset_job,
+)
+
+jobs = [
+    summon_asset_job,
+    sources_asset_job,
+    tenant_asset_job,
+    tenant_namespaces_job,
+    release_asset_job,
+]
+from pydantic import Field
+
+from . import assets
+from .utils import PythonMinioAddress
+
+
+all_assets = load_assets_from_modules([assets])
+
+#harvest_job = define_asset_job(name="harvest_job", selection="harvest_and_release")
+
+from .sensors import (
+    release_file_sensor,
+    release_file_sensor_v2,
+    sources_sensor,
+    tenant_names_sensor,
+    sources_s3_sensor,
+    tenant_s3_sensor,
+    # tenant_names_sensor_v2
+)
+
+slack_on_run_failure = make_slack_on_run_failure_sensor(
+ os.getenv("SLACK_CHANNEL"),
+ os.getenv("SLACK_TOKEN")
+)
+all_sensors = [
+    slack_on_run_failure,
+    # release_file_sensor,
+    release_file_sensor_v2,
+    sources_sensor,  # original code. Now use a schedule
+    tenant_names_sensor,
+    sources_s3_sensor,
+    tenant_s3_sensor,
+    # tenant_names_sensor_v2
+]
+
+from .sensors.gleaner_summon import sources_schedule
+
+all_schedules = [sources_schedule]
+
+def _awsEndpointAddress(url, port=None, use_ssl=True):
+ if use_ssl:
+ protocol = "https"
+ else:
+ protocol = "http"
+ if port is not None:
+ return f"{protocol}://{url}:{port}"
+ else:
+ return f"{protocol}://{url}"
+
+s3 = S3Resource(
+    endpoint_url=_awsEndpointAddress(
+ EnvVar('GLEANERIO_MINIO_ADDRESS').get_value(),
+ port=EnvVar('GLEANERIO_MINIO_PORT').get_value(),
+ use_ssl=EnvVar('GLEANERIO_MINIO_USE_SSL').get_value()
+ ),
+ aws_access_key_id=EnvVar('GLEANERIO_MINIO_ACCESS_KEY'),
+ aws_secret_access_key=EnvVar('GLEANERIO_MINIO_SECRET_KEY')
+)
+gleaners3=gleanerS3Resource(
+ # GLEANER_MINIO_BUCKET =EnvVar('GLEANER_MINIO_BUCKET'),
+ # GLEANER_MINIO_ADDRESS=EnvVar('GLEANER_MINIO_ADDRESS'),
+ # GLEANER_MINIO_PORT=EnvVar('GLEANER_MINIO_PORT'),
+ GLEANERIO_MINIO_BUCKET=EnvVar('GLEANERIO_MINIO_BUCKET'),
+ GLEANERIO_MINIO_ADDRESS=EnvVar('GLEANERIO_MINIO_ADDRESS'),
+ GLEANERIO_MINIO_PORT=EnvVar('GLEANERIO_MINIO_PORT'),
+ GLEANERIO_MINIO_USE_SSL=os.environ.get('GLEANERIO_MINIO_USE_SSL', "True"),
+ GLEANERIO_MINIO_ACCESS_KEY=EnvVar('GLEANERIO_MINIO_ACCESS_KEY'),
+ GLEANERIO_MINIO_SECRET_KEY=EnvVar('GLEANERIO_MINIO_SECRET_KEY'),
+ GLEANERIO_CONFIG_PATH=os.environ.get('GLEANERIO_CONFIG_PATH'),
+ GLEANERIO_SOURCES_FILENAME=os.environ.get('GLEANERIO_SOURCES_FILENAME'),
+ GLEANERIO_TENANT_FILENAME=os.environ.get('GLEANERIO_TENANT_FILENAME'),
+ # this is S3. It is the s3 resource
+ s3=s3
+
+)
+triplestore=BlazegraphResource(
+ GLEANERIO_GRAPH_URL=EnvVar('GLEANERIO_GRAPH_URL'),
+ GLEANERIO_GRAPH_NAMESPACE=EnvVar('GLEANERIO_GRAPH_NAMESPACE'),
+ gs3=gleaners3,
+ )
+triplestore_summary=BlazegraphResource(
+ GLEANERIO_GRAPH_URL=EnvVar('GLEANERIO_GRAPH_URL'),
+ GLEANERIO_GRAPH_NAMESPACE=EnvVar('GLEANERIO_GRAPH_SUMMARY_NAMESPACE'),
+ gs3=gleaners3,
+ )
+
+resources = {
+ "local": {
+ "gleanerio": GleanerioResource(
+# DEBUG=os.environ.get('DEBUG'),
+ DEBUG_CONTAINER=False,
+ GLEANERIO_DOCKER_URL=EnvVar('GLEANERIO_DOCKER_URL'),
+ GLEANERIO_PORTAINER_APIKEY=EnvVar('GLEANERIO_PORTAINER_APIKEY'),
+
+ GLEANERIO_DOCKER_HEADLESS_NETWORK=EnvVar('GLEANERIO_DOCKER_HEADLESS_NETWORK'),
+ GLEANERIO_HEADLESS_ENDPOINT=EnvVar('GLEANERIO_HEADLESS_ENDPOINT'),
+
+ GLEANERIO_GLEANER_IMAGE=EnvVar('GLEANERIO_GLEANER_IMAGE'),
+ GLEANERIO_NABU_IMAGE=EnvVar('GLEANERIO_NABU_IMAGE'),
+
+ GLEANERIO_DAGSTER_CONFIG_PATH=EnvVar('GLEANERIO_DAGSTER_CONFIG_PATH'),
+
+
+ GLEANERIO_DOCKER_NABU_CONFIG=EnvVar('GLEANERIO_DOCKER_NABU_CONFIG'),
+ GLEANERIO_DOCKER_GLEANER_CONFIG=EnvVar('GLEANERIO_DOCKER_GLEANER_CONFIG'),
+
+ GLEANERIO_NABU_CONFIG_PATH=EnvVar('GLEANERIO_NABU_CONFIG_PATH'),
+ GLEANERIO_GLEANER_CONFIG_PATH=EnvVar('GLEANERIO_GLEANER_CONFIG_PATH'),
+
+ GLEANERIO_LOG_PREFIX=EnvVar('GLEANERIO_LOG_PREFIX'),
+
+ GLEANERIO_DOCKER_CONTAINER_WAIT_TIMEOUT=os.environ.get('GLEANERIO_DOCKER_CONTAINER_WAIT_TIMEOUT',600),
+ GLEANERIO_GRAPH_NAMESPACE=EnvVar('GLEANERIO_GRAPH_NAMESPACE'),
+ GLEANERIO_GRAPH_SUMMARY_NAMESPACE=EnvVar('GLEANERIO_GRAPH_SUMMARY_NAMESPACE'),
+ gs3=gleaners3,
+ # s3=gleanerS3Resource(
+ # GLEANERIO_MINIO_ADDRESS="oss.geocodes-aws-dev.earthcube.org",
+ # GLEANERIO_MINIO_PORT=443,
+ # GLEANERIO_MINIO_USE_SSL=True,
+ # GLEANERIO_MINIO_BUCKET="test",
+ # GLEANERIO_MINIO_ACCESS_KEY="worldsbestaccesskey",
+ # GLEANERIO_MINIO_SECRET_KEY="worldsbestsecretkey",
+ # ),
+ triplestore=triplestore,
+ # triplestore=BlazegraphResource(
+ # GLEANERIO_GRAPH_URL=EnvVar('GLEANERIO_GRAPH_URL'),
+ # GLEANERIO_GRAPH_NAMESPACE=EnvVar('GLEANERIO_GRAPH_NAMESPACE'),
+ # ),
+ triplestore_summary=triplestore_summary
+ ), # gleaner
+ "s3":s3,
+ "gs3":gleaners3,
+ "triplestore": triplestore,
+ "slack": SlackResource(token=EnvVar("SLACK_TOKEN")),
+ },
+ "production": {
+ "gleanerio": GleanerioResource(
+ DEBUG_CONTAINER=False,
+
+ GLEANERIO_DOCKER_URL=EnvVar('GLEANERIO_DOCKER_URL'),
+ GLEANERIO_PORTAINER_APIKEY=EnvVar('GLEANERIO_PORTAINER_APIKEY'),
+
+ GLEANERIO_DOCKER_HEADLESS_NETWORK=EnvVar('GLEANERIO_DOCKER_HEADLESS_NETWORK'),
+ GLEANERIO_HEADLESS_ENDPOINT=EnvVar('GLEANERIO_HEADLESS_ENDPOINT'),
+
+ GLEANERIO_GLEANER_IMAGE=EnvVar('GLEANERIO_GLEANER_IMAGE'),
+ GLEANERIO_NABU_IMAGE=EnvVar('GLEANERIO_NABU_IMAGE'),
+
+ GLEANERIO_DAGSTER_CONFIG_PATH=EnvVar('GLEANERIO_DAGSTER_CONFIG_PATH'),
+
+
+ GLEANERIO_DOCKER_NABU_CONFIG=EnvVar('GLEANERIO_DOCKER_NABU_CONFIG'),
+ GLEANERIO_DOCKER_GLEANER_CONFIG=EnvVar('GLEANERIO_DOCKER_GLEANER_CONFIG'),
+
+ GLEANERIO_NABU_CONFIG_PATH=EnvVar('GLEANERIO_NABU_CONFIG_PATH'),
+ GLEANERIO_GLEANER_CONFIG_PATH=EnvVar('GLEANERIO_GLEANER_CONFIG_PATH'),
+
+ GLEANERIO_LOG_PREFIX=EnvVar('GLEANERIO_LOG_PREFIX'),
+
+ GLEANERIO_DOCKER_CONTAINER_WAIT_TIMEOUT=os.environ.get('GLEANERIO_DOCKER_CONTAINER_WAIT_TIMEOUT',600),
+ GLEANERIO_GRAPH_NAMESPACE=EnvVar('GLEANERIO_GRAPH_NAMESPACE'),
+ GLEANERIO_GRAPH_SUMMARY_NAMESPACE=EnvVar('GLEANERIO_GRAPH_SUMMARY_NAMESPACE'),
+ gs3=gleaners3,
+ triplestore=triplestore,
+ triplestore_summary=triplestore_summary,
+
+
+ ), # gleaner
+    # this needs to be keyed "s3" so code expecting the s3 resource can find it.
+ "s3":s3,
+ "gs3":gleaners3,
+ "triplestore":triplestore,
+ "slack":SlackResource(token=EnvVar("SLACK_TOKEN")),
+ },
+}
+
+deployment_name = os.environ.get("DAGSTER_DEPLOYMENT", "local")
+
+
+
+defs = Definitions(
+ assets=all_assets,
+ resources=resources[deployment_name],
+ sensors=all_sensors,
+ jobs=jobs,
+ schedules=all_schedules
+# jobs=[harvest_job]
+
+)
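One caveat on the helper above: `EnvVar(...).get_value()` returns a string, so `use_ssl` can arrive as e.g. `"False"` (which is truthy) rather than a bool; callers should convert it. A standalone restatement for a quick sanity check:

```python
# Restated here so the snippet runs on its own; values are assumptions.
def _awsEndpointAddress(url, port=None, use_ssl=True):
    protocol = "https" if use_ssl else "http"
    return f"{protocol}://{url}:{port}" if port is not None else f"{protocol}://{url}"

assert _awsEndpointAddress("minio.local", port=9000, use_ssl=False) == "http://minio.local:9000"
assert _awsEndpointAddress("oss.example.org", use_ssl=True) == "https://oss.example.org"
```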
diff --git a/dagster/implnets/workflows/ingest/ingest/assets/__init__.py b/dagster/implnets/workflows/ingest/ingest/assets/__init__.py
new file mode 100644
index 00000000..52c0d72b
--- /dev/null
+++ b/dagster/implnets/workflows/ingest/ingest/assets/__init__.py
@@ -0,0 +1,20 @@
+from .gleaner_geocdoes_demo import gleanerio_demo
+from .gleaner_summon_assets import (
+ gleanerio_run, release_nabu_run, release_summarize,
+ load_report_s3,load_report_graph,validate_sitemap_url,
+ bucket_urls, identifier_stats,
+ graph_stats_report,
+ SUMMARY_PATH,RELEASE_PATH
+)
+from .gleaner_sources import (
+    gleanerio_orgs,
+    gleanerio_tenants,
+    gleanerio_sources,
+    tenant_partitions_def,
+    sources_partitions_def,
+)
+
+from .tenant import (
+ TenantOpConfig, TenantConfig,
+ upload_release,upload_summary,
+ create_tenant_containers, create_graph_namespaces
+)
diff --git a/dagster/implnets/workflows/ingest/ingest/assets/gleaner_asset_factory.py b/dagster/implnets/workflows/ingest/ingest/assets/gleaner_asset_factory.py
new file mode 100644
index 00000000..1f3bf8ab
--- /dev/null
+++ b/dagster/implnets/workflows/ingest/ingest/assets/gleaner_asset_factory.py
@@ -0,0 +1,13 @@
+
+from dagster import get_dagster_logger, asset, In, Nothing, Config
+
+from ..resources import gleanerio
+class gleaner(Config):
+ source: str
+
+ def create_gleaner_asset(self,context):
+ @asset(name=f"{self.source}_gleaner")
+ def _gleanerio():
+ gleanerio(context, ("gleaner"), self.source)
+
+ return _gleanerio()
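The `Config` subclass above is reaching for the usual Dagster asset-factory pattern; a minimal self-contained sketch of that pattern, with assumed source names and a logged stand-in for the gleaner call:

```python
from dagster import Definitions, asset

def make_gleaner_asset(source: str):
    @asset(name=f"{source}_gleaner")
    def _gleanerio(context):
        # in the real workflow this would invoke the gleanerio resource
        context.log.info(f"summon {source}")
    return _gleanerio

defs = Definitions(assets=[make_gleaner_asset(s) for s in ["iris", "opentopography"]])
```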
diff --git a/dagster/implnets/workflows/ingest/ingest/assets/gleaner_geocdoes_demo.py b/dagster/implnets/workflows/ingest/ingest/assets/gleaner_geocdoes_demo.py
new file mode 100644
index 00000000..e0c1e9d9
--- /dev/null
+++ b/dagster/implnets/workflows/ingest/ingest/assets/gleaner_geocdoes_demo.py
@@ -0,0 +1,19 @@
+# a test asset to see that all the resource configurations load.
+# basically runs the first step, of gleaner on geocodes demo datasets
+
+from dagster import get_dagster_logger, asset, In, Nothing, Config
+
+from ..resources.gleanerio import GleanerioResource
+@asset(key_prefix="ingest",required_resource_keys={"gleanerio"})
+def gleanerio_demo(context ):
+    gleaner_resource = context.resources.gleanerio
+ source="geocodes_demo_datasets"
+ gleaner = gleaner_resource.execute(context, "gleaner", source )
+ context.add_output_metadata(
+ metadata={
+ "source": source, # Metadata can be any key-value pair
+ "run": "gleaner",
+ # The `MetadataValue` class has useful static methods to build Metadata
+ }
+ )
+
diff --git a/dagster/implnets/workflows/ingest/ingest/assets/gleaner_sources.py b/dagster/implnets/workflows/ingest/ingest/assets/gleaner_sources.py
new file mode 100644
index 00000000..dbbf24c4
--- /dev/null
+++ b/dagster/implnets/workflows/ingest/ingest/assets/gleaner_sources.py
@@ -0,0 +1,175 @@
+# assets that read the source and tenant configuration:
+# the orgs list, the tenant file, and the sources file (with sitemap validation)
+import orjson
+
+import dagster
+from dagster import get_dagster_logger, asset,multi_asset, AssetOut, In, Nothing, Config,DynamicPartitionsDefinition, sensor
+import yaml
+from ec.sitemap import Sitemap
+
+sources_partitions_def = DynamicPartitionsDefinition(name="sources_names_active")
+#from ..resources.gleanerio import GleanerioResource
+tenant_partitions_def = DynamicPartitionsDefinition(name="tenant_names_paritition")
+### PRESENT HACK: using the orgs.
+# really needs to read a future tenant file, and then add
+# new partitions with a sensor
+# need to add a sensor to add partitions when one is added (see the sensor sketch below)
+# https://docs.dagster.io/concepts/partitions-schedules-sensors/partitioning-assets#dynamically-partitioned-assets
+
+# for right now, using a list of orgs as the sources.
+# future: read the gleaner config file.
+# future future: store sources in (s3/googlesheets) and read them.
+
+
+@asset(
+ #group_name="configs",
+ name="org_names",key_prefix="ingest",required_resource_keys={"gs3"})
+def gleanerio_orgs(context ):
+ s3_resource = context.resources.gs3
+ source="orgs_list_from_a_s3_bucket"
+ files = s3_resource.listPath(path='orgs')
+ orgs = list(map(lambda o: o["Key"].removeprefix("orgs/").removesuffix(".nq") , files))
+ dagster.get_dagster_logger().info(str(orgs))
+ context.add_output_metadata(
+ metadata={
+ "source": source, # Metadata can be any key-value pair
+ "run": "gleaner",
+ # The `MetadataValue` class has useful static methods to build Metadata
+ }
+ )
+ #return orjson.dumps(orgs, option=orjson.OPT_INDENT_2)
+ # this is used for partitioning, so let it pickle (aka be a python list)
+ return orgs
+#@asset(group_name="configs",name="tenant_names",required_resource_keys={"gs3"})
+@multi_asset(
+    outs={
+        "tenant_all": AssetOut(key_prefix="ingest", group_name="configs"),
+        "tenant_names": AssetOut(key_prefix="ingest", group_name="configs"),
+    },
+    required_resource_keys={"gs3"},
+)
+def gleanerio_tenants(context):
+ gleaner_resource = context.resources.gs3
+ s3_resource = context.resources.gs3
+ # tennant_path = f'{s3_resource.GLEANERIO_CONFIG_PATH}{s3_resource.GLEANERIO_TENANT_FILENAME}'
+ # get_dagster_logger().info(f"tennant_path {tennant_path} ")
+ #
+ # tennant = s3_resource.getFile(path=tennant_path)
+ tenant = s3_resource.getTennatFile()
+ get_dagster_logger().info(f"tenant {tenant} ")
+ tenant_obj = yaml.safe_load(tenant)
+ tenants = list(map(lambda t: t["community"], tenant_obj["tenant"] ))
+ context.add_output_metadata(
+ metadata={
+ "source": tenants, # Metadata can be any key-value pair
+ "run": "gleaner",
+ # The `MetadataValue` class has useful static methods to build Metadata
+ }, output_name="tenant_all"
+ )
+ context.add_output_metadata(
+ metadata={
+ "source": tenants, # Metadata can be any key-value pair
+ "run": "gleaner",
+ # The `MetadataValue` class has useful static methods to build Metadata
+ }, output_name="tenant_names"
+ )
+ #return orjson.dumps(orgs, option=orjson.OPT_INDENT_2)
+ # this is used for partitioning, so let it pickle (aka be a python list)
+ return tenant_obj, tenants
+
+"""
+check a soruce list, return invalid and valid sources lists
+"""
+def check_for_valid_sitemap( sources_active):
+ validated_sources=[]
+ for source in sources_active:
+ try:
+ sm = Sitemap(source['url'], no_progress_bar=True)
+
+ source['sm_url_is_valid'] = sm.validUrl()
+ validated_sources.append(source)
+ get_dagster_logger().info(f" sitemap url valid {source['sm_url_is_valid']} for {source['name']} {source['url']}")
+ except Exception as e:
+ get_dagster_logger().error(f" sitemap url ERROR for {source['name']} {source['url']} exception {e}")
+ source['sm_url_is_valid'] = False
+ validated_sources.append(source)
+ return validated_sources
+@multi_asset(
+    outs={
+        "sources_all": AssetOut(key_prefix="ingest", group_name="configs"),
+        "sources_names_active": AssetOut(key_prefix="ingest", group_name="configs"),
+        "sources_names_invalid_sitemap": AssetOut(key_prefix="ingest", group_name="configs"),
+    },
+    required_resource_keys={"gs3"},
+)
+def gleanerio_sources(context ):
+
+ s3_resource = context.resources.gs3
+ # tennant_path = f'{s3_resource.GLEANERIO_CONFIG_PATH}{s3_resource.GLEANERIO_TENANT_FILENAME}'
+ # get_dagster_logger().info(f"tennant_path {tennant_path} ")
+ #
+ # tennant = s3_resource.getFile(path=tennant_path)
+ source = s3_resource.getSourcesFile()
+ get_dagster_logger().info(f"sources {source} ")
+ sources_obj = yaml.safe_load(source)
+ sources_all_value = list(filter(lambda t: t["name"], sources_obj["sources"]))
+ sources_active_value = filter(lambda t: t["active"], sources_all_value )
+ source_sm_validated = list(check_for_valid_sitemap( sources_active_value))
+ context.log.info(f"validated sitemaps {source_sm_validated} ")
+ sources_active_names = list(map(lambda t: t["name"], filter(lambda t: t["sm_url_is_valid"], source_sm_validated )))
+ sources_invalid_sm = list(map(lambda t: t["name"], filter(lambda t: not t["sm_url_is_valid"], source_sm_validated)))
+
+ context.add_output_metadata(
+ metadata={
+ "source": sources_active_names, # Metadata can be any key-value pair
+ "run": "gleaner",
+ # The `MetadataValue` class has useful static methods to build Metadata
+ }, output_name="sources_names_active"
+ )
+ #return orjson.dumps(orgs, option=orjson.OPT_INDENT_2)
+ # this is used for partitioning, so let it pickle (aka be a python list)
+ return sources_all_value, sources_active_names,sources_invalid_sm
+# @asset(required_resource_keys={"gs3"})
+# def gleanerio_orgs(context ):
+# s3_resource = context.resources.gs3
+# source="geocodes_demo_datasets"
+# files = s3_resource.listPath(path='orgs')
+# orgs = list(map(lambda o: o["Key"].removeprefix("orgs/").removesuffix(".nq") , files))
+# # rather than do this with an @asset_sensor, just do it here.
+# sources = orgs
+# new_sources = [
+# source
+# for source in sources
+# if not sources_partitions_def.has_partition_key(
+# source, dynamic_partitions_store=context.instance
+# )
+# ]
+# sources_partitions_def.build_add_request(new_sources)
+# # return SensorResult(
+# # run_requests=[
+# # RunRequest(partition_key=source) for source in new_sources
+# # ],
+# # dynamic_partitions_requests=[
+# # sources_partitions_def.build_add_request(new_sources)
+# # ],
+# # )
+# dagster.get_dagster_logger().info(str(orgs))
+# context.add_output_metadata(
+# metadata={
+# "source": source, # Metadata can be any key-value pair
+# "new_sources":new_sources,
+# "run": "gleaner",
+# # The `MetadataValue` class has useful static methods to build Metadata
+# }
+# )
+# #return orjson.dumps(orgs, option=orjson.OPT_INDENT_2)
+# # this is used for partitioning, so let it pickle (aka be a python list)
+# return orgs
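A hypothetical sketch of the sensor-adds-partitions approach from the comments above, following the linked Dagster docs; the discovery step and the job name are assumptions:

```python
from dagster import DynamicPartitionsDefinition, RunRequest, SensorResult, sensor

sources_partitions_def = DynamicPartitionsDefinition(name="sources_names_active")

@sensor(job_name="summon_asset_job")
def new_sources_sensor(context):
    discovered = ["iris", "opentopography"]  # assumed: read from the sources file in s3
    new = [
        s for s in discovered
        if not context.instance.has_dynamic_partition(sources_partitions_def.name, s)
    ]
    return SensorResult(
        run_requests=[RunRequest(partition_key=s) for s in new],
        dynamic_partitions_requests=[sources_partitions_def.build_add_request(new)],
    )
```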
diff --git a/dagster/implnets/workflows/ingest/ingest/assets/gleaner_summon_assets.py b/dagster/implnets/workflows/ingest/ingest/assets/gleaner_summon_assets.py
new file mode 100644
index 00000000..c5d6e375
--- /dev/null
+++ b/dagster/implnets/workflows/ingest/ingest/assets/gleaner_summon_assets.py
@@ -0,0 +1,437 @@
+# assets for the summon pipeline: run gleaner, build the release file,
+# and produce the load/identifier/bucket reports for each source
+from typing import Any
+import json
+import pandas as pd
+import csv
+from urllib.error import HTTPError
+
+from dagster import (
+    asset, op, Config, Output, AssetKey,
+    define_asset_job, AssetSelection,
+    get_dagster_logger, BackfillPolicy,
+)
+from ec.datastore import s3 as utils_s3
+from ec.sitemap import Sitemap
+from .gleaner_sources import sources_partitions_def
+from ..utils import PythonMinioAddress
+
+from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner, endpointUpdateNamespace
+from ec.reporting.report import missingReport, generateIdentifierRepo, generateGraphReportsRelease
+from ec.graph.release_graph import ReleaseGraph
+from ec.summarize import summaryDF2ttl, get_summary4graph, get_summary4repoSubset
+
+SUMMARY_PATH = 'graphs/summary'
+RELEASE_PATH = 'graphs/latest'
+
+class HarvestOpConfig(Config):
+ source_name: str
+# sources_partitions_def = StaticPartitionsDefinition(
+# ["geocodes_demo_datasets", "iris"]
+# )
+
+def getSource(context, source_name):
+ sources = context.repository_def.load_asset_value(AssetKey(["ingest","sources_all"]))
+ source = list(filter(lambda t: t["name"]==source_name, sources))
+ return source[0]
+
+@asset(
+ group_name="load",
+ key_prefix="ingest",
+ deps=[AssetKey(["ingest","sources_names_active"]) ],
+ partitions_def=sources_partitions_def, required_resource_keys={"gleanerio"}
+ # , backfill_policy=BackfillPolicy.single_run()
+ )
+def validate_sitemap_url(context):
+ source_name = context.asset_partition_key_for_output()
+ source = getSource(context, source_name)
+ sm = Sitemap(source['url'], no_progress_bar=True)
+ if sm.validUrl():
+ return source['url']
+ else:
+ context.log.error(f"source: {source['name']} bad url: {source['url']}")
+        raise HTTPError(url=source['url'],
+                        code=404,
+                        hdrs=None,
+                        fp=None,
+                        msg=f"Bad URL source: {source['name']} bad url: {source['url']}")
+
+@asset(group_name="load",
+key_prefix="ingest",
+op_tags={"ingest": "docker"},
+ deps=[ validate_sitemap_url ],
+ partitions_def=sources_partitions_def, required_resource_keys={"gleanerio"}
+ # , backfill_policy=BackfillPolicy.single_run()
+ )
+#@asset( required_resource_keys={"gleanerio"})
+def gleanerio_run(context ) -> Output[Any]:
+ gleaner_resource = context.resources.gleanerio
+ source= context.asset_partition_key_for_output()
+ gleaner = gleaner_resource.execute(context, "gleaner", source )
+
+ metadata={
+ "source": source, # Metadata can be any key-value pair
+ "run": "gleaner",
+ # The `MetadataValue` class has useful static methods to build Metadata
+ }
+
+ return Output(gleaner, metadata=metadata)
+@asset(group_name="load",
+key_prefix="ingest",
+op_tags={"ingest": "docker"},
+ deps=[gleanerio_run],
+ partitions_def=sources_partitions_def, required_resource_keys={"gleanerio"}
+ # ,backfill_policy=BackfillPolicy.single_run()
+ )
+#@asset(required_resource_keys={"gleanerio"})
+def release_nabu_run(context) -> Output[Any]:
+ gleaner_resource = context.resources.gleanerio
+ source= context.asset_partition_key_for_output()
+ nabu=gleaner_resource.execute(context, "release", source )
+ metadata={
+ "source": source, # Metadata can be any key-value pair
+ "run": "release",
+ "bucket_name": gleaner_resource.gs3.GLEANERIO_MINIO_BUCKET, # Metadata can be any key-value pair
+ "object_name": f"{RELEASE_PATH}{source}"
+ # The `MetadataValue` class has useful static methods to build Metadata
+ }
+
+ return Output(nabu, metadata=metadata)
+''' Return results of summoning the JSON-LD SOS from a source.
+This includes the number of URLs in the sitemap and how many JSON-LD documents were 'summoned'.
+There may be multiple JSON-LD documents per web page, so this needs to be monitored over time.
+It also reports how many made it into milled (a measure of how good the conversion of a single JSON-LD document to RDF is).
+'''
+
+@asset(
+key_prefix="ingest",
+ group_name="load",
+op_tags={"ingest": "report"},
+ deps=[gleanerio_run], partitions_def=sources_partitions_def, required_resource_keys={"gleanerio"}
+ # , backfill_policy=BackfillPolicy.single_run()
+)
+def load_report_s3(context):
+ gleaner_resource = context.resources.gleanerio
+ s3_resource = context.resources.gleanerio.gs3.s3
+ gleaner_s3 = context.resources.gleanerio.gs3
+ source_name = context.asset_partition_key_for_output()
+ # source = getSitemapSourcesFromGleaner(gleaner_resource.GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source_name)
+ source = getSource(context, source_name)
+ source_url = source.get('url')
+ s3Minio = utils_s3.MinioDatastore(PythonMinioAddress(gleaner_s3.GLEANERIO_MINIO_ADDRESS,
+ gleaner_s3.GLEANERIO_MINIO_PORT),
+ gleaner_s3.MinioOptions()
+ )
+ bucket = gleaner_s3.GLEANERIO_MINIO_BUCKET
+
+ graphendpoint = None
+ milled = False
+ summon = True
+ returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon)
+    r = str('load report returned value:{}'.format(returned_value))
+ report = json.dumps(returned_value, indent=2)
+ s3Minio.putReportFile(bucket, source_name, "load_report_s3.json", report)
+ get_dagster_logger().info(f"load s3 report returned {r} ")
+ return
+
+
+''' Return results of what JSON-LD SOS is in the S3 store, compared to the 'named' graphs
+in the graph store. This extends the s3 load report.
+This includes the number of URLs in the sitemap and how many JSON-LD documents were 'summoned'.
+There may be multiple JSON-LD documents per web page, so this needs to be monitored over time.
+It also reports how many made it into milled (a measure of how good the conversion of a single JSON-LD document to RDF is).
+It then compares what identifiers are in the S3 store (summon path) with the named graph URIs.
+'''
+
+@asset(
+key_prefix="ingest",
+ group_name="load",
+op_tags={"ingest": "report"},
+ deps=[release_nabu_run], partitions_def=sources_partitions_def, required_resource_keys={"gleanerio"}
+ # , backfill_policy=BackfillPolicy.single_run()
+)
+def load_report_graph(context):
+ gleaner_resource = context.resources.gleanerio
+ s3_resource = context.resources.gleanerio.gs3.s3
+ gleaner_s3 = context.resources.gleanerio.gs3
+ gleaner_triplestore = context.resources.gleanerio.triplestore
+
+ source_name = context.asset_partition_key_for_output()
+ # source = getSitemapSourcesFromGleaner(gleaner_resource.GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source_name)
+ source = getSource(context, source_name)
+ source_url = source.get('url')
+ s3Minio = utils_s3.MinioDatastore(PythonMinioAddress(gleaner_s3.GLEANERIO_MINIO_ADDRESS,
+ gleaner_s3.GLEANERIO_MINIO_PORT),
+ gleaner_s3.MinioOptions()
+ )
+ bucket = gleaner_s3.GLEANERIO_MINIO_BUCKET
+
+ graphendpoint = gleaner_triplestore.GraphEndpoint(gleaner_resource.GLEANERIO_GRAPH_NAMESPACE)
+ milled = False
+ summon = True
+ returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=False) # summon false. we want the graph
+    r = str('load report graph returned value:{}'.format(returned_value))
+ report = json.dumps(returned_value, indent=2)
+ s3Minio.putReportFile(bucket, source_name, "load_report_graph.json", report)
+ get_dagster_logger().info(f"load report to graph returned {r} ")
+ return
+class S3ObjectInfo:
+ bucket_name=""
+ object_name=""
+@asset(group_name="load",key_prefix="ingest",
+ name="release_summarize",
+ deps=[release_nabu_run], partitions_def=sources_partitions_def, required_resource_keys={"gleanerio"}
+ # , backfill_policy=BackfillPolicy.single_run()
+ )
+def release_summarize(context) :
+ gleaner_resource = context.resources.gleanerio
+ s3_resource = context.resources.gleanerio.gs3.s3
+ gleaner_s3 = context.resources.gleanerio.gs3
+ triplestore =context.resources.gleanerio.triplestore
+ source_name = context.asset_partition_key_for_output()
+ #source = getSitemapSourcesFromGleaner(gleaner_resource.GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source_name)
+ source = getSource(context,source_name)
+ source_url = source.get('url')
+ s3Minio = utils_s3.MinioDatastore(PythonMinioAddress(gleaner_s3.GLEANERIO_MINIO_ADDRESS,
+ gleaner_s3.GLEANERIO_MINIO_PORT),
+ gleaner_s3.MinioOptions()
+ )
+ bucket = gleaner_s3.GLEANERIO_MINIO_BUCKET
+
+ # endpoint = triplestore.GraphEndpoint# getting data, not uploading data
+ #summary_namespace = _graphSummaryEndpoint()
+
+ try:
+
+ # summarydf = get_summary4repoSubset(endpoint, source_name)
+ rg = ReleaseGraph()
+ rg.read_release(PythonMinioAddress(gleaner_s3.GLEANERIO_MINIO_ADDRESS,
+ gleaner_s3.GLEANERIO_MINIO_PORT),
+ bucket,
+ source_name,
+ options=gleaner_s3.MinioOptions())
+ summarydf = rg.summarize()
+ nt, g = summaryDF2ttl(summarydf, source_name) # let's try the new generator
+ summaryttl = g.serialize(format='longturtle')
+    # Let's always write the file out to s3, and insert as a separate process.
+    # we might be able to make this an asset..., but it would need to be accessible by http
+    # if not stored in s3
+ objectname = f"{SUMMARY_PATH}/{source_name}_release.ttl" # needs to match that is expected by post
+ s3ObjectInfo = S3ObjectInfo()
+ s3ObjectInfo.bucket_name = bucket
+ s3ObjectInfo.object_name = objectname
+
+ bucket_name, object_name =s3Minio.putTextFileToStore(summaryttl, s3ObjectInfo)
+ context.add_output_metadata(
+ metadata={
+ "source": source, # Metadata can be any key-value pair
+ "run": "release_summarize",
+ "bucket_name": bucket_name, # Metadata can be any key-value pair
+ "object_name": object_name,
+ # The `MetadataValue` class has useful static methods to build Metadata
+ }
+ )
+ # inserted = sumnsgraph.insert(bytes(summaryttl, 'utf-8'), content_type="application/x-turtle")
+ # if not inserted:
+ # raise Exception("Loading to graph failed.")
+ except Exception as e:
+ # use dagster logger
+ get_dagster_logger().error(f"Summary. Issue creating graph {str(e)} ")
+ raise Exception(f"Loading Summary graph failed. {str(e)}")
+
+ return
+
+@asset(group_name="load",key_prefix="ingest",
+ deps=[gleanerio_run],
+op_tags={"ingest": "report"},
+ partitions_def=sources_partitions_def, required_resource_keys={"gleanerio"}
+ # , backfill_policy=BackfillPolicy.single_run()
+ )
+def identifier_stats(context):
+ gleaner_resource = context.resources.gleanerio
+ s3_resource = context.resources.gleanerio.gs3.s3
+ gleaner_s3 = context.resources.gleanerio.gs3
+    triplestore = context.resources.gleanerio.triplestore
+ source_name = context.asset_partition_key_for_output()
+ # source = getSitemapSourcesFromGleaner(gleaner_resource.GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source_name)
+ source = getSource(context, source_name)
+ source_url = source.get('url')
+ s3Minio = utils_s3.MinioDatastore(PythonMinioAddress(gleaner_s3.GLEANERIO_MINIO_ADDRESS,
+ gleaner_s3.GLEANERIO_MINIO_PORT),
+ gleaner_s3.MinioOptions()
+ )
+ bucket = gleaner_s3.GLEANERIO_MINIO_BUCKET
+
+
+ returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
+ r = str('returned value:{}'.format(returned_value))
+ #r = str('identifier stats returned value:{}'.format(returned_value))
+ report = returned_value.to_json()
+ s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report)
+ get_dagster_logger().info(f"identifer stats report returned {r} ")
+ return
+
+@asset(group_name="load",key_prefix="ingest",
+ deps=[gleanerio_run],
+op_tags={"ingest": "report"},
+ partitions_def=sources_partitions_def, required_resource_keys={"gleanerio"}
+ # , backfill_policy=BackfillPolicy.single_run()
+ )
+def bucket_urls(context):
+ gleaner_resource = context.resources.gleanerio
+ s3_resource = context.resources.gleanerio.gs3.s3
+ gleaner_s3 = context.resources.gleanerio.gs3
+    triplestore = context.resources.gleanerio.triplestore
+ source_name = context.asset_partition_key_for_output()
+ # source = getSitemapSourcesFromGleaner(gleaner_resource.GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source_name)
+ source = getSource(context, source_name)
+ source_url = source.get('url')
+ s3Minio = utils_s3.MinioDatastore(PythonMinioAddress(gleaner_s3.GLEANERIO_MINIO_ADDRESS,
+ gleaner_s3.GLEANERIO_MINIO_PORT),
+ gleaner_s3.MinioOptions()
+ )
+ bucket = gleaner_s3.GLEANERIO_MINIO_BUCKET
+
+
+ res = s3Minio.listSummonedUrls(bucket, source_name)
+ r = str('returned value:{}'.format(res))
+ bucketurls = pd.DataFrame(res).to_csv(index=False, quoting=csv.QUOTE_NONNUMERIC)
+ s3Minio.putReportFile(bucket, source_name, "bucketutil_urls.csv", bucketurls)
+ get_dagster_logger().info(f"bucker urls report returned {r} ")
+ return
+
+# original code. inlined.
+# def _releaseUrl( source, path=RELEASE_PATH, extension="nq"):
+# proto = "http"
+# if GLEANER_MINIO_USE_SSL:
+# proto = "https"
+# address = _pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT)
+# bucket = GLEANER_MINIO_BUCKET
+# release_url = f"{proto}://{address}/{bucket}/{path}/{source}_release.{extension}"
+# return release_url
+@asset(group_name="load",key_prefix="ingest",
+ deps=[release_nabu_run],
+op_tags={"ingest": "report"},
+ partitions_def=sources_partitions_def, required_resource_keys={"gleanerio"}
+ # , backfill_policy=BackfillPolicy.single_run()
+ )
+def graph_stats_report(context):
+ gleaner_resource = context.resources.gleanerio
+ s3_resource = context.resources.gleanerio.gs3.s3
+ gleaner_s3 = context.resources.gleanerio.gs3
+ triplestore = context.resources.gleanerio.triplestore
+ source_name = context.asset_partition_key_for_output()
+ # source = getSitemapSourcesFromGleaner(gleaner_resource.GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source_name)
+ source = getSource(context, source_name)
+ source_url = source.get('url')
+ s3Minio = utils_s3.MinioDatastore(PythonMinioAddress(gleaner_s3.GLEANERIO_MINIO_ADDRESS,
+ gleaner_s3.GLEANERIO_MINIO_PORT),
+ gleaner_s3.MinioOptions()
+ )
+ bucket = gleaner_s3.GLEANERIO_MINIO_BUCKET
+
+ #returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"])
+ proto = "http"
+ if gleaner_s3.GLEANERIO_MINIO_USE_SSL:
+ proto = "https"
+ address = PythonMinioAddress(gleaner_s3.GLEANERIO_MINIO_ADDRESS, gleaner_s3.GLEANERIO_MINIO_PORT)
+
+ s3FileUrl = f"{proto}://{address}/{bucket}/{RELEASE_PATH}/{source_name}_release.nq"
+ #s3FileUrl = _releaseUrl(source_name )
+ get_dagster_logger().info(f"get release for {source_name} from returned {s3FileUrl} ")
+ returned_value = generateGraphReportsRelease(source_name,s3FileUrl)
+ r = str('returned value:{}'.format(returned_value))
+ #report = json.dumps(returned_value, indent=2) # value already json.dumps
+ report = returned_value
+ s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report)
+ get_dagster_logger().info(f"graph stats returned {r} ")
+ return
+
+#might need to use this https://docs.dagster.io/_apidocs/repositories#dagster.RepositoryDefinition.get_asset_value_loader
+#@sensor(job=summon_asset_job)
+# @sensor(asset_selection=AssetSelection.keys("gleanerio_orgs"))
+# def sources_sensor(context ):
+# sources = gleanerio_orgs
+# new_sources = [
+# source
+# for source in sources
+# if not sources_partitions_def.has_partition_key(
+# source, dynamic_partitions_store=context.instance
+# )
+# ]
+#
+# return SensorResult(
+# run_requests=[
+# RunRequest(partition_key=source) for source in new_sources
+# ],
+# dynamic_partitions_requests=[
+# sources_partitions_def.build_add_request(new_sources)
+# ],
+# )
+
+# need to add a sensor to add partitions when a source is added; a sketch follows
+# https://docs.dagster.io/concepts/partitions-schedules-sensors/partitioning-assets#dynamically-partitioned-assets
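+# A hedged sketch of such a sensor, assuming the active source names can be read
+# the same way the commented-out sources_sensor above reads gleanerio_orgs:
+# @sensor(job=sources_asset_job)
+# def sources_partition_sensor(context):
+#     source_names = gleanerio_orgs  # assumption: an iterable of source names
+#     new_sources = [
+#         s for s in source_names
+#         if not sources_partitions_def.has_partition_key(s, dynamic_partitions_store=context.instance)
+#     ]
+#     return SensorResult(
+#         run_requests=[RunRequest(partition_key=s) for s in new_sources],
+#         dynamic_partitions_requests=[sources_partitions_def.build_add_request(new_sources)],
+#     )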
+
+
+# #########
+# CRUFT
+# worked to see if this could be a graph with an asset; really, a defined asset job works better
+
+# ## partitioning
+# ####
+# class HarvestOpConfig(Config):
+# source_name: str
+# @dynamic_partitioned_config(partition_fn=gleanerio_orgs)
+# def harvest_config(partition_key: str):
+# return {
+# "ops":
+# {"harvest_and_release":
+# {"config": {"source_name": partition_key},
+# "ops": {
+# "gleanerio_run":
+# {"config": {"source_name": partition_key}
+# },
+# "nabu_release_run":
+# {"config": {"source_name": partition_key}
+# }
+# }
+# }
+# }
+# }
+#
+# # ops:
+# # harvest_and_release:
+# # ops:
+# # gleanerio_run:
+# # config:
+# # source_name: ""
+# # nabu_release_run:
+# # config:
+# # source_name: ""
+#
+# @graph_asset(partitions_def=sources_partitions_def)
+# #@graph_asset( )
+# def harvest_and_release() :
+# #source = context.asset_partition_key_for_output()
+# #containers = getImage()
+# #harvest = gleanerio_run(start=containers)
+# harvest = gleanerio_run()
+# release = nabu_release_run(harvest)
+# return release
+#
+# #@asset
+# # def harvest_op(context, config: HarvestOpConfig):
+# # context.log.info(config.source_name)
+# # harvest = gleanerio_run()
+# # release = nabu_release_run(harvest)
+# # return release
+#
+# # @job(config=harvest_config)
+# # def harvest_job( ):
+# # harvest_op()
+# #harvest_and_release()
+# # @schedule(cron_schedule="0 0 * * *", job=harvest_job)
+# # def geocodes_schedule():
+# # return RunRequest(partition_key="iris")
diff --git a/dagster/implnets/workflows/ingest/ingest/assets/tenant.py b/dagster/implnets/workflows/ingest/ingest/assets/tenant.py
new file mode 100644
index 00000000..5259c52a
--- /dev/null
+++ b/dagster/implnets/workflows/ingest/ingest/assets/tenant.py
@@ -0,0 +1,185 @@
+from dagster import (
+ op, job, Config,asset,
+ In, Nothing,
+ sensor, RunRequest, RunConfig,
+ SensorEvaluationContext, asset_sensor, EventLogEntry,
+ SkipReason,
+ AssetKey,
+ static_partitioned_config, dynamic_partitioned_config, DynamicPartitionsDefinition,
+ define_asset_job, AssetSelection,graph_asset,
+ get_dagster_logger
+)
+
+from dagster_aws.s3.sensor import get_s3_keys
+from typing import List, Dict
+from pydantic import Field
+import pydash
+
+#from pydash.collections import find
+#from pydash.predicates import is_match
+from ec.graph.manageGraph import ManageBlazegraph
+from ..assets import gleanerio_tenants, tenant_partitions_def, sources_partitions_def
+from .gleaner_summon_assets import RELEASE_PATH, SUMMARY_PATH
+
+class TenantConfig(Config):
+ source_name: str
+ name: str
+ source_list: List[str]
+ TENANT_GRAPH_NAMESPACE: str
+ TENANT_GRAPH_SUMMARY_NAMESPACE: str
+ SUMMARY_PATH: str = Field(
+ description="GLEANERIO_GRAPH_SUMMARY_PATH.", default='graphs/summary')
+ RELEASE_PATH : str = Field(
+ description="GLEANERIO_GRAPH_RELEASE_PATH.", default='graphs/latest')
+
+
+class TenantOpConfig(Config):
+ source_name: str
+
+def find_tenants_with_source(context, source_name, tenants_all):
+    get_dagster_logger().info(f" find tenant {source_name} with {tenants_all}")
+    tenants = []
+    # tenants = pydash.collections.find(tenants_all,
+    #    lambda t: pydash.predicates.is_match(t["sources"], source_name) or pydash.predicates.is_match(t["sources"], 'all')
+    # )
+    # tenants = pydash.collections.find(tenants_all, lambda t: pydash.predicates.is_match(t["sources"], "all") )
+    for tenant in tenants_all:
+        get_dagster_logger().info(f" {tenant['community']} sources {tenant['sources']}")
+        if source_name in tenant["sources"]:
+            get_dagster_logger().info(f" found source {source_name} in {tenant['community']}")
+            tenants.append(tenant)
+        elif 'all' in tenant["sources"]:  # elif so a tenant is not appended twice
+            get_dagster_logger().info(f" found source all in {tenant['community']}")
+            tenants.append(tenant)
+    context.log.info(f" source {source_name} in {tenants}")
+    return tenants
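+# A hedged one-liner sketch of the pydash filter the commented attempts above were after
+# (pydash.filter_ keeps every matching tenant, where find would return only the first):
+# tenants = pydash.filter_(tenants_all, lambda t: source_name in t["sources"] or 'all' in t["sources"])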
+@asset(
+    group_name="tenant_load", key_prefix="ingest",
+    op_tags={"ingest": "graph"},
+    deps=[AssetKey(["ingest","tenant_names"]), AssetKey(["ingest","tenant_all"])],
+    required_resource_keys={"gleanerio",},
+    partitions_def=sources_partitions_def
+)
+#def upload_release(context, config: TenantOpConfig):
+def upload_release(context ):
+ #context.log.info(config.source_name)
+ tenants_all = context.repository_def.load_asset_value(AssetKey(["ingest","tenant_all"]))['tenant']
+ source_name = context.asset_partition_key_for_output()
+
+ context.log.info(f"source_name {source_name}")
+ gleaner_resource = context.resources.gleanerio
+ s3_resource = context.resources.gleanerio.gs3.s3
+ gleaner_s3 = context.resources.gleanerio.gs3
+ triplestore = context.resources.gleanerio.triplestore
+ tenants = find_tenants_with_source(context, source_name, tenants_all)
+ for tenant in tenants:
+ #tenant["graph"]['main_namespace']
+ #bg = ManageBlazegraph(triplestore.GLEANERIO_GRAPH_URL, tenant["graph"]['main_namespace'])
+ try:
+ #bg.upload_nq_file()
+ namespace = tenant['graph']['main_namespace']
+ endpoint = triplestore.GraphEndpoint(namespace)
+ triplestore.post_to_graph(source_name, path=RELEASE_PATH, extension="nq", graphendpoint=endpoint)
+ context.log.info(f"load release for {source_name} to tenant {tenant['community']} {endpoint} ")
+        except Exception as ex:
+            context.log.error(f"load release to tenant {tenant['community']} failed for {source_name}: {ex}")
+            raise Exception(f"load release to tenant {tenant['community']} failed for {source_name}: {ex}")
+ return
+
+#@asset(required_resource_keys={"gleanerio",},ins={"start": In(Nothing)})
+@asset(group_name="tenant_load",key_prefix="ingest",
+op_tags={"ingest": "graph"},
+ deps=[AssetKey(["ingest","tenant_names"]), AssetKey(["ingest","tenant_all"])],
+ required_resource_keys={"gleanerio",}
+ ,partitions_def=sources_partitions_def
+ )
+#def upload_summary(context, config: TenantOpConfig):
+def upload_summary(context):
+ #context.log.info(config.source_name)
+ source_name = context.asset_partition_key_for_output()
+ context.log.info(f"tennant_name {source_name} ")
+ tenants_all = context.repository_def.load_asset_value(AssetKey(["ingest","tenant_all"]))['tenant']
+
+ gleaner_resource = context.resources.gleanerio
+ s3_resource = context.resources.gleanerio.gs3.s3
+ gleaner_s3 = context.resources.gleanerio.gs3
+ triplestore = context.resources.gleanerio.triplestore
+ tenants = find_tenants_with_source(context,source_name, tenants_all)
+ for tenant in tenants:
+ # bg = ManageBlazegraph(triplestore.GLEANERIO_GRAPH_URL, tenant["graph"]['summary_namespace'])
+ try:
+ # bg.upload_nq_file()
+ namespace = tenant['graph']['summary_namespace']
+ endpoint = triplestore.GraphEndpoint(namespace)
+ triplestore.post_to_graph(source_name, path=SUMMARY_PATH,extension="ttl", graphendpoint=endpoint)
+ context.log.info(f"load summary for {source_name} to tenant {tenant['community']} {endpoint}")
+        except Exception as ex:
+            context.log.error(f"load summary to tenant {tenant['community']} failed for {source_name}: {ex}")
+            raise Exception(f"load summary to tenant {tenant['community']} failed for {source_name}: {ex}")
+ return
+#
+# @asset(group_name="tenant_create",required_resource_keys={"gleanerio",},partitions_def=tenant_partitions_def)
+# def create_graph_namespaces(context):
+# #context.log.info(config.source_name)
+# source_name = context.asset_partition_key_for_output()
+# context.log.info(f"tennant_name {source_name}")
+# gleaner_resource = context.resources.gleanerio
+# s3_resource = context.resources.gleanerio.gs3.s3
+# gleaner_s3 = context.resources.gleanerio.gs3
+# triplestore = context.resources.gleanerio.triplestore
+# pass
+@asset(group_name="tenant_create",key_prefix="ingest",
+ deps=[AssetKey(["ingest","tenant_all"])],
+op_tags={"ingest": "graph"},
+ required_resource_keys={"gleanerio",},partitions_def=tenant_partitions_def)
+def create_graph_namespaces(context):
+ #context.log.info(config.source_name)
+    tenant_name = context.asset_partition_key_for_output()
+    context.log.info(f"tenant_name {tenant_name}")
+    tenants = context.repository_def.load_asset_value(AssetKey(["ingest","tenant_all"]))
+    # from https://stackoverflow.com/questions/2361426/get-the-first-item-from-an-iterable-that-matches-a-condition
+    tenant = next((t for t in tenants["tenant"] if t['community'] == tenant_name), None)
+    if tenant is None:
+        raise Exception("Tenant with name {} does not exist".format(tenant_name))
+    context.log.info(f"tenant {tenant}")
+ # should we put a default.
+ main_namespace = tenant["graph"]["main_namespace"]
+ summary_namespace = tenant["graph"]["summary_namespace"]
+ gleaner_resource = context.resources.gleanerio
+ s3_resource = context.resources.gleanerio.gs3.s3
+ gleaner_s3 = context.resources.gleanerio.gs3
+ triplestore = context.resources.gleanerio.triplestore
+ bg = ManageBlazegraph(triplestore.GLEANERIO_GRAPH_URL, main_namespace )
+ bg_summary = ManageBlazegraph(triplestore.GLEANERIO_GRAPH_URL, summary_namespace)
+ try:
+ msg = bg.createNamespace(quads=True)
+ context.log.info(f"graph creation {tenant_name} {triplestore.GLEANERIO_GRAPH_URL} {msg}")
+ msg = bg_summary.createNamespace(quads=False)
+ context.log.info(f"graph creation {tenant_name} {triplestore.GLEANERIO_GRAPH_URL} {msg}")
+ except Exception as ex :
+ context.log.error(f"graph creation failed {tenant_name} {triplestore.GLEANERIO_GRAPH_URL} {ex}")
+ raise Exception(f"graph creation failed {tenant_name} {triplestore.GLEANERIO_GRAPH_URL} {ex}")
+ return
+
+@asset(group_name="tenant_create",key_prefix="ingest",
+ deps=[AssetKey(["ingest","tenant_all"]), AssetKey(["ingest","create_graph_namespaces"])],
+ required_resource_keys={"gleanerio",},partitions_def=tenant_partitions_def)
+def create_tenant_containers(context):
+ #context.log.info(config.source_name)
+ tenant_name = context.asset_partition_key_for_output()
+ tenants = context.repository_def.load_asset_value(AssetKey(["ingest","tenant_all"]))
+ context.log.info(f"tennant_name {tenant_name}")
+ gleaner_resource = context.resources.gleanerio
+ s3_resource = context.resources.gleanerio.gs3.s3
+ gleaner_s3 = context.resources.gleanerio.gs3
+ triplestore = context.resources.gleanerio.triplestore
+ pass
+#@static_partitioned_config(partition_keys=TENNANT_NAMES)
+
+ #return {"ops": {"continent_op": {"config": {"continent_name": partition_key}}}}
+#@job(config=tennant_config, partitions_def=tenant_partitions_def)
+# @job( partitions_def=tenant_partitions_def)
+# def build_community():
+# source_name = context.asset_partition_key_for_output()
+# context.log.info(f"tennant_name {source_name}")
+# upload_summary(upload_release())
diff --git a/dagster/implnets/workflows/ingest/ingest/implnet_jobs_SOURCEVAL.py b/dagster/implnets/workflows/ingest/ingest/implnet_jobs_SOURCEVAL.py
new file mode 100644
index 00000000..bd1e2c14
--- /dev/null
+++ b/dagster/implnets/workflows/ingest/ingest/implnet_jobs_SOURCEVAL.py
@@ -0,0 +1,7 @@
+from dagster import job
+
+from ops.implnet_ops_SOURCEVAL import harvest_SOURCEVAL
+
+@job
+def implnet_job_SOURCEVAL():
+ harvest_SOURCEVAL()
\ No newline at end of file
diff --git a/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_r2r.py b/dagster/implnets/workflows/ingest/ingest/implnet_ops_SOURCEVAL.py
similarity index 82%
rename from dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_r2r.py
rename to dagster/implnets/workflows/ingest/ingest/implnet_ops_SOURCEVAL.py
index ab34084f..156f27b8 100644
--- a/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_r2r.py
+++ b/dagster/implnets/workflows/ingest/ingest/implnet_ops_SOURCEVAL.py
@@ -48,7 +48,7 @@
# env items
URL = os.environ.get('PORTAINER_URL')
APIKEY = os.environ.get('PORTAINER_KEY')
-
+CONTAINER_WAIT_TIMEOUT = int(os.environ.get('GLEANERIO_CONTAINER_WAIT_SECONDS', 5))  # env values arrive as strings; wait() needs an int
GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
@@ -138,7 +138,7 @@ def s3reader(object):
get_dagster_logger().info(f"S3 read error : {str(err)}")
-def s3loader(data, name):
+def s3loader(data, name, date_string=None):
+    # compute the default per call; a def-time default would freeze one timestamp at import
+    if date_string is None:
+        date_string = datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
secure= GLEANER_MINIO_USE_SSL
server = _pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT)
@@ -158,8 +158,8 @@ def s3loader(data, name):
# else:
# print("Bucket 'X' already exists")
- now = datetime.now()
- date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
+ # now = datetime.now()
+ # date_string = now.strftime("%Y_%m_%d_%H_%M_%S")
logname = name + '_{}.log'.format(date_string)
objPrefix = GLEANERIO_LOG_PREFIX + logname
@@ -307,7 +307,7 @@ def gleanerio(context, mode, source):
## ------------ Create
returnCode = 0
get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
-
+ date_string = datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
if str(mode) == "gleaner":
IMAGE =GLEANERIO_GLEANER_IMAGE
@@ -424,13 +424,8 @@ def gleanerio(context, mode, source):
data["Env"] = enva
data["HostConfig"] = {
"NetworkMode": GLEANER_HEADLESS_NETWORK,
- # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"]
}
- # data["Volumes"] = [
- # "dagster-project:/configs"
- # ]
- # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky
- # end setup of data
+
# docker dagster
get_dagster_logger().info(f"start docker code region: ")
@@ -468,102 +463,74 @@ def gleanerio(context, mode, source):
cid = container.id # legacy til the start get's fixed
-
- ## ------------ Archive to load, which is how to send in the config (from where?)
-
-
-
-# this method of watching the logs,
- # do not let a possible issue with container logs stop log upload.
- ## I thinkthis happens when a container exits immediately.
- try:
- for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):
- get_dagster_logger().debug(line) # noqa: T201
- except docker.errors.APIError as ex:
-
- get_dagster_logger().info(f"This is ok. watch container logs failed Docker API ISSUE: {repr(ex)}")
- except Exception as ex:
- get_dagster_logger().info(f"This is ok. watch container logs failed other issue:{repr(ex)} ")
-
-
-
-
- # ## ------------ Wait expect 200
- # we want to get the logs, no matter what, so do not exit, yet.
- ## or should logs be moved into finally?
- ### in which case they need to be methods that don't send back errors.
- exit_status = container.wait()["StatusCode"]
- get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
- # WE PULL THE LOGS, then will throw an error
- returnCode = exit_status
-
-
-
-
- ## ------------ Copy logs expect 200
-
-
- c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
-
- # write to s3
-
- s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object
- #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
- # write to minio (would need the minio info here)
-
- get_dagster_logger().info(f"container Logs to s3: ")
-
-## get log files
- url = URL + 'containers/' + cid + '/archive'
- params = {
- 'path': f"{WorkingDir}/logs"
- }
- query_string = urllib.parse.urlencode(params)
- url = url + "?" + query_string
-
- # print(url)
- req = request.Request(url, method="GET")
- req.add_header('X-API-Key', APIKEY)
- req.add_header('content-type', 'application/x-compressed')
- req.add_header('accept', 'application/json')
- r = request.urlopen(req)
-
- log.info(f"{r.status} ")
- get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}")
- # s3loader(r.read().decode('latin-1'), NAME)
- s3loader(r.read(), f"{source}_{mode}_runlogs")
- # Future, need to extraxct files, and upload
+# Removed watching the logs, in favor of periodic upload
+ wait_count = 0
+ while True:
+ wait_count += 1
+ try:
+            exit_status = container.wait(timeout=CONTAINER_WAIT_TIMEOUT)["StatusCode"]
+ get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
+ # WE PULL THE LOGS, then will throw an error
+ returnCode = exit_status
+ c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
+
+ # write to s3
+
+ s3loader(str(c).encode(), NAME, date_string=date_string) # s3loader needs a bytes like object
+ # s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
+ # write to minio (would need the minio info here)
+
+ get_dagster_logger().info(f"container Logs to s3: ")
+# this needs to be addressed at some point. https://www.appsloveworld.com/docker/100/85/docker-py-getarchive-destination-folder
+ path = f"{WorkingDir}/logs"
+ tar_archive_stream, tar_stat = container.get_archive(path)
+ archive = bytearray()
+ for chunk in tar_archive_stream:
+ archive.extend(chunk)
+ s3loader(archive, f"{source}_{mode}_runlogs", date_string=date_string)
+ get_dagster_logger().info(f"uploaded logs : {source}_{mode}_runlogs to {path}")
+ break
+ except requests.exceptions.ReadTimeout as ex:
+ path = f"{WorkingDir}/logs"
+ tar_archive_stream, tar_stat = container.get_archive(path)
+ archive = bytearray()
+ for chunk in tar_archive_stream:
+ archive.extend(chunk)
+ s3loader(archive, f"{source}_{mode}_runlogs", date_string=date_string)
+ get_dagster_logger().info(f"uploaded {wait_count}th log : {source}_{mode}_runlogs to {path}")
+ except docker.errors.APIError as ex:
+ get_dagster_logger().info(f"Container Wait docker API error : {str(ex)}")
+ returnCode = 1
+ break
+        container.reload()  # status is cached on the client object; refresh before checking
+        if container.status == 'exited' or container.status == 'removed':
+            get_dagster_logger().info(f"Container exited or removed. status: {container.status}")
+            exit_status = container.wait()["StatusCode"]
+            returnCode = exit_status
+            # pull the logs here; c is not bound on this path
+            c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
+            s3loader(str(c).encode(), NAME, date_string=date_string)  # s3loader needs a bytes-like object
+            # write to minio (would need the minio info here)
+
+            get_dagster_logger().info(f"container Logs to s3: ")
+            # this needs to be addressed at some point. https://www.appsloveworld.com/docker/100/85/docker-py-getarchive-destination-folder
+            path = f"{WorkingDir}/logs"
+            tar_archive_stream, tar_stat = container.get_archive(path)
+            archive = bytearray()
+            for chunk in tar_archive_stream:
+                archive.extend(chunk)
+            s3loader(archive, f"{source}_{mode}_runlogs", date_string=date_string)
+            get_dagster_logger().info(f"uploaded logs : {source}_{mode}_runlogs to {path}")
+            break
+
+    # ABOVE Future, need to extract files, and upload
# pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
# pw_tar.extractall("extract_to/")
- # looks like get_archive also has issues. Returns nothing,
- # strm, stat = container.get_archive(f"{WorkingDir}/logs/")
- # get_dagster_logger().info(f"container Logs to s3: {str(stat)}")
- #
- # i =0
- # for d in strm:
- # r = d.decode('utf-8')
- # # s3loader(r.read().decode('latin-1'), NAME)
- # s3loader(r.encode(), f"{source}_{i}_runlogs")
- # i+=1
-
- # s3loader(r.read().decode('latin-1'), NAME)
if exit_status != 0:
raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
finally:
if (not DEBUG) :
- # if (cid):
- # url = URL + 'containers/' + cid
- # req = request.Request(url, method="DELETE")
- # req.add_header('X-API-Key', APIKEY)
- # # req.add_header('content-type', 'application/json')
- # req.add_header('accept', 'application/json')
- # r = request.urlopen(req)
- # print(r.status)
- # get_dagster_logger().info(f"Container Remove: {str(r.status)}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
if (service):
service.remove()
get_dagster_logger().info(f"Service Remove: {service.name}")
@@ -572,14 +539,7 @@ def gleanerio(context, mode, source):
else:
get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
- # if (container):
- # container.remove(force=True)
- # get_dagster_logger().info(f"Container Remove: {container.name}")
- # else:
- # get_dagster_logger().info(f"Container Not created, so not removed.")
- #
- # else:
- # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED")
+
if (returnCode != 0):
get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
@@ -587,7 +547,7 @@ def gleanerio(context, mode, source):
return returnCode
@op
-def r2r_getImage(context):
+def SOURCEVAL_getImage(context):
run_container_context = DockerContainerContext.create_for_run(
context.dagster_run,
context.instance.run_launcher
@@ -599,54 +559,54 @@ def r2r_getImage(context):
client.images.pull(GLEANERIO_GLEANER_IMAGE)
client.images.pull(GLEANERIO_NABU_IMAGE)
@op(ins={"start": In(Nothing)})
-def r2r_gleaner(context):
- returned_value = gleanerio(context, ("gleaner"), "r2r")
+def SOURCEVAL_gleaner(context):
+ returned_value = gleanerio(context, ("gleaner"), "SOURCEVAL")
r = str('returned value:{}'.format(returned_value))
get_dagster_logger().info(f"Gleaner returned {r} ")
return
@op(ins={"start": In(Nothing)})
-def r2r_nabu_prune(context):
- returned_value = gleanerio(context,("prune"), "r2r")
+def SOURCEVAL_nabu_prune(context):
+ returned_value = gleanerio(context,("prune"), "SOURCEVAL")
r = str('returned value:{}'.format(returned_value))
get_dagster_logger().info(f"nabu prune returned {r} ")
return
@op(ins={"start": In(Nothing)})
-def r2r_nabuprov(context):
- returned_value = gleanerio(context,("prov"), "r2r")
+def SOURCEVAL_nabuprov(context):
+ returned_value = gleanerio(context,("prov"), "SOURCEVAL")
r = str('returned value:{}'.format(returned_value))
get_dagster_logger().info(f"nabu prov returned {r} ")
return
@op(ins={"start": In(Nothing)})
-def r2r_nabuorg(context):
- returned_value = gleanerio(context,("orgs"), "r2r")
+def SOURCEVAL_nabuorg(context):
+ returned_value = gleanerio(context,("orgs"), "SOURCEVAL")
r = str('returned value:{}'.format(returned_value))
get_dagster_logger().info(f"nabu org load returned {r} ")
return
@op(ins={"start": In(Nothing)})
-def r2r_naburelease(context):
- returned_value = gleanerio(context,("release"), "r2r")
+def SOURCEVAL_naburelease(context):
+ returned_value = gleanerio(context,("release"), "SOURCEVAL")
r = str('returned value:{}'.format(returned_value))
get_dagster_logger().info(f"nabu release returned {r} ")
return
@op(ins={"start": In(Nothing)})
-def r2r_uploadrelease(context):
- returned_value = post_to_graph("r2r", extension="nq")
+def SOURCEVAL_uploadrelease(context):
+ returned_value = post_to_graph("SOURCEVAL", extension="nq")
r = str('returned value:{}'.format(returned_value))
get_dagster_logger().info(f"upload release returned {r} ")
return
@op(ins={"start": In(Nothing)})
-def r2r_missingreport_s3(context):
- source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="r2r")
+def SOURCEVAL_missingreport_s3(context):
+ source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="SOURCEVAL")
source_url = source.get('url')
s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
bucket = GLEANER_MINIO_BUCKET
- source_name = "r2r"
+ source_name = "SOURCEVAL"
graphendpoint = None
milled = False
summon = True
@@ -657,12 +617,12 @@ def r2r_missingreport_s3(context):
get_dagster_logger().info(f"missing s3 report returned {r} ")
return
@op(ins={"start": In(Nothing)})
-def r2r_missingreport_graph(context):
- source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="r2r")
+def SOURCEVAL_missingreport_graph(context):
+ source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="SOURCEVAL")
source_url = source.get('url')
s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
bucket = GLEANER_MINIO_BUCKET
- source_name = "r2r"
+ source_name = "SOURCEVAL"
graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
@@ -676,12 +636,12 @@ def r2r_missingreport_graph(context):
get_dagster_logger().info(f"missing graph report returned {r} ")
return
@op(ins={"start": In(Nothing)})
-def r2r_graph_reports(context) :
- source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="r2r")
+def SOURCEVAL_graph_reports(context) :
+ source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="SOURCEVAL")
#source_url = source.get('url')
s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
bucket = GLEANER_MINIO_BUCKET
- source_name = "r2r"
+ source_name = "SOURCEVAL"
graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql"
@@ -696,11 +656,11 @@ def r2r_graph_reports(context) :
return
@op(ins={"start": In(Nothing)})
-def r2r_identifier_stats(context):
- source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="r2r")
+def SOURCEVAL_identifier_stats(context):
+ source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="SOURCEVAL")
s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
bucket = GLEANER_MINIO_BUCKET
- source_name = "r2r"
+ source_name = "SOURCEVAL"
returned_value = generateIdentifierRepo(source_name, bucket, s3Minio)
r = str('returned value:{}'.format(returned_value))
@@ -711,10 +671,10 @@ def r2r_identifier_stats(context):
return
@op(ins={"start": In(Nothing)})
-def r2r_bucket_urls(context):
+def SOURCEVAL_bucket_urls(context):
s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
bucket = GLEANER_MINIO_BUCKET
- source_name = "r2r"
+ source_name = "SOURCEVAL"
res = s3Minio.listSummonedUrls(bucket, source_name)
r = str('returned value:{}'.format(res))
@@ -728,10 +688,10 @@ class S3ObjectInfo:
object_name=""
@op(ins={"start": In(Nothing)})
-def r2r_summarize(context) :
+def SOURCEVAL_summarize(context) :
s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
bucket = GLEANER_MINIO_BUCKET
- source_name = "r2r"
+ source_name = "SOURCEVAL"
endpoint = _graphEndpoint() # getting data, not uploading data
summary_namespace = _graphSummaryEndpoint()
@@ -762,20 +722,20 @@ def r2r_summarize(context) :
return
@op(ins={"start": In(Nothing)})
-def r2r_upload_summarize(context):
- returned_value = post_to_graph("r2r",path=SUMMARY_PATH, extension="ttl", graphendpoint=_graphSummaryEndpoint())
+def SOURCEVAL_upload_summarize(context):
+ returned_value = post_to_graph("SOURCEVAL",path=SUMMARY_PATH, extension="ttl", graphendpoint=_graphSummaryEndpoint())
r = str('returned value:{}'.format(returned_value))
get_dagster_logger().info(f"upload summary returned {r} ")
return
#Can we simplify and use just a method. Then import these methods?
-# def missingreport_s3(context, msg: str, source="r2r"):
+# def missingreport_s3(context, msg: str, source="SOURCEVAL"):
#
# source= getSitemapSourcesFromGleaner("/scheduler/gleanerconfig.yaml", sourcename=source)
# source_url = source.get('url')
# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS)
# bucket = GLEANER_MINIO_BUCKET
-# source_name="r2r"
+# source_name="SOURCEVAL"
#
# graphendpoint = None
# milled = False
@@ -784,32 +744,32 @@ def r2r_upload_summarize(context):
# r = str('returned value:{}'.format(returned_value))
# return msg + r
@graph
-def harvest_r2r():
- containers = r2r_getImage()
- harvest = r2r_gleaner(start=containers)
+def harvest_SOURCEVAL():
+ containers = SOURCEVAL_getImage()
+ harvest = SOURCEVAL_gleaner(start=containers)
# defingin nothing dependencies
# https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies
- report_ms3 = r2r_missingreport_s3(start=harvest)
- report_idstat = r2r_identifier_stats(start=report_ms3)
+ report_ms3 = SOURCEVAL_missingreport_s3(start=harvest)
+ report_idstat = SOURCEVAL_identifier_stats(start=report_ms3)
# for some reason, this causes a msg parameter missing
- report_bucketurl = r2r_bucket_urls(start=report_idstat)
+ report_bucketurl = SOURCEVAL_bucket_urls(start=report_idstat)
- #report1 = missingreport_s3(harvest, source="r2r")
- load_release = r2r_naburelease(start=harvest)
- load_uploadrelease = r2r_uploadrelease(start=load_release)
+ #report1 = missingreport_s3(harvest, source="SOURCEVAL")
+ load_release = SOURCEVAL_naburelease(start=harvest)
+ load_uploadrelease = SOURCEVAL_uploadrelease(start=load_release)
- load_prune = r2r_nabu_prune(start=load_uploadrelease)
- load_prov = r2r_nabuprov(start=load_prune)
- load_org = r2r_nabuorg(start=load_prov)
+ load_prune = SOURCEVAL_nabu_prune(start=load_uploadrelease)
+ load_prov = SOURCEVAL_nabuprov(start=load_prune)
+ load_org = SOURCEVAL_nabuorg(start=load_prov)
- summarize = r2r_summarize(start=load_uploadrelease)
- upload_summarize = r2r_upload_summarize(start=summarize)
+ summarize = SOURCEVAL_summarize(start=load_uploadrelease)
+ upload_summarize = SOURCEVAL_upload_summarize(start=summarize)
# run after load
- report_msgraph = r2r_missingreport_graph(start=summarize)
- report_graph = r2r_graph_reports(start=report_msgraph)
+ report_msgraph = SOURCEVAL_missingreport_graph(start=summarize)
+ report_graph = SOURCEVAL_graph_reports(start=report_msgraph)
diff --git a/dagster/implnets/workflows/ingest/ingest/implnet_sch_SOURCEVAL.py b/dagster/implnets/workflows/ingest/ingest/implnet_sch_SOURCEVAL.py
new file mode 100644
index 00000000..232980c5
--- /dev/null
+++ b/dagster/implnets/workflows/ingest/ingest/implnet_sch_SOURCEVAL.py
@@ -0,0 +1,8 @@
+from dagster import schedule
+
+from jobs.implnet_jobs_SOURCEVAL import implnet_job_SOURCEVAL
+
+@schedule(cron_schedule="0 24 * * *", job=implnet_job_SOURCEVAL, execution_timezone="US/Central")
+def implnet_sch_SOURCEVAL(_context):
+ run_config = {}
+ return run_config
diff --git a/dagster/implnets/workflows/ingest/ingest/jobs/__init__.py b/dagster/implnets/workflows/ingest/ingest/jobs/__init__.py
new file mode 100644
index 00000000..f1a7cbeb
--- /dev/null
+++ b/dagster/implnets/workflows/ingest/ingest/jobs/__init__.py
@@ -0,0 +1,5 @@
+from .summon_assets import summon_asset_job, sources_asset_job, sources_partitions_def
+
+from .tenant_load import tenant_asset_job, tenant_namespaces_job, release_asset_job
diff --git a/dagster/implnets/workflows/ingest/ingest/jobs/summon_assets.py b/dagster/implnets/workflows/ingest/ingest/jobs/summon_assets.py
new file mode 100644
index 00000000..6901701a
--- /dev/null
+++ b/dagster/implnets/workflows/ingest/ingest/jobs/summon_assets.py
@@ -0,0 +1,29 @@
+from dagster import (
+    asset, Config, Output, AssetKey,
+    define_asset_job, AssetSelection,
+    get_dagster_logger,
+)
+
+from ..assets.gleaner_summon_assets import *
+from ..assets.gleaner_sources import sources_partitions_def, gleanerio_sources
+
+# disabling load_graph report until we can move it to tenant build runs.
+summon_asset_job = define_asset_job(
+ name="summon_and_release_job",
+ selection=AssetSelection.assets(validate_sitemap_url, gleanerio_run, release_nabu_run, load_report_s3,
+ release_summarize, identifier_stats, bucket_urls,
+ graph_stats_report,
+ #load_report_graph
+ ),
+ partitions_def=sources_partitions_def,
+ #tags={"dagster/concurrency_key": 'ingest'},
+tags={"ingest": 'docker'},
+)
+# The "ingest" tag key lets the deployment limit op/asset concurrency across runs:
+# https://docs.dagster.io/guides/limiting-concurrency-in-data-pipelines#limiting-opasset-concurrency-across-runs
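+# A hedged sketch (deployment assumption) of dagster.yaml limits keyed on that tag:
+#   run_coordinator:
+#     module: dagster.core.run_coordinator
+#     class: QueuedRunCoordinator
+#     config:
+#       tag_concurrency_limits:
+#         - key: "ingest"
+#           value: "docker"
+#           limit: 2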
+sources_asset_job = define_asset_job(
+ name="sources_config_updated_job",
+ selection=AssetSelection.assets(AssetKey(["ingest","sources_names_active"])).required_multi_asset_neighbors(),
+ partitions_def=sources_partitions_def,
+ tags={"dagster/priority": "11"}
+)
diff --git a/dagster/implnets/workflows/ingest/ingest/jobs/tenant_load.py b/dagster/implnets/workflows/ingest/ingest/jobs/tenant_load.py
new file mode 100644
index 00000000..dab6d1ee
--- /dev/null
+++ b/dagster/implnets/workflows/ingest/ingest/jobs/tenant_load.py
@@ -0,0 +1,100 @@
+from dagster import (
+    op, job, Config,
+    sensor, RunRequest, RunConfig,
+    SensorEvaluationContext, asset_sensor, EventLogEntry,
+    SkipReason,
+    AssetKey,
+    static_partitioned_config, dynamic_partitioned_config, DynamicPartitionsDefinition,
+    define_asset_job, AssetSelection, graph_asset,
+    BackfillPolicy
+)
+
+from dagster_aws.s3.sensor import get_s3_keys
+from typing import List, Dict
+from pydantic import Field
+
+from ..assets import gleanerio_tenants, tenant_partitions_def, sources_partitions_def, upload_release,upload_summary
+from ..assets.tenant import create_tenant_containers, create_graph_namespaces
+from ..resources.gleanerio import GleanerioResource
+from ..resources.gleanerS3 import gleanerS3Resource
+from ..resources.graph import BlazegraphResource
+
+
+
+
+tenant_asset_job = define_asset_job(
+ name="tenant_config_updated_job",
+ selection=AssetSelection.assets(AssetKey(["ingest","tenant_names"])).required_multi_asset_neighbors(),
+ partitions_def=sources_partitions_def,
+ tags={"dagster/priority": "10"}
+)
+
+release_asset_job = define_asset_job(
+ name="tenant_release_job",
+ selection=AssetSelection.assets(upload_release,upload_summary),
+ partitions_def=sources_partitions_def,
+ tags={"dagster/priority": "3", "ingest": "graph"}
+ # tags={"dagster/concurrency_key": 'graph'},
+)
+# Attempted to set a tag with the reserved system prefix dagster/concurrency_key; dagster rejects it:
+#   File "/usr/local/lib/python3.11/site-packages/dagster/_daemon/sensor.py", line 471, in _process_tick_generator
+
+tenant_namespaces_job = define_asset_job(
+ name="tenant_namespaces_job",
+ selection=AssetSelection.assets(create_tenant_containers, create_graph_namespaces),
+ partitions_def=tenant_partitions_def,
+ tags={"dagster/priority": "20"}
+)
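+# dagster/priority is read by the queued run coordinator: higher values dequeue first,
+# so tenant namespace setup (20) and config updates (10/11) run ahead of release loads (3).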
+
+# @job(partitions_def=tenant_partitions_def)
+# def tenant_namespaces_job(context):
+# source_name = context.asset_partition_key_for_output()
+# context.log.info(f"tenant_name {source_name}")
+# create_tenant_containers(create_graph_namespaces())
+
+
+class TenantConfig(Config):
+ source_name: str
+ name: str
+ source_list: List[str]
+ TENANT_GRAPH_NAMESPACE: str
+ TENANT_GRAPH_SUMMARY_NAMESPACE: str
+ SUMMARY_PATH: str = Field(
+ description="GLEANERIO_GRAPH_SUMMARY_PATH.", default='graphs/summary')
+ RELEASE_PATH : str = Field(
+ description="GLEANERIO_GRAPH_RELEASE_PATH.", default='graphs/latest')
+@dynamic_partitioned_config(partition_fn=gleanerio_tenants)
+def tenant_config(partition_key: str):
+
+ # default_config ={"ops": {
+ # "upload_release":
+ # {"config":
+ # {
+ # TenantConfig(
+ # source_name=partition_key,
+ # name="name",
+ # source_list=[],
+ # TENANT_GRAPH_NAMESPACE="",
+ # TENANT_GRAPH_SUMMARY_NAMESPACE=""
+ # )
+ # }
+ # }
+ # },
+ # "upload_summary":
+ # {"config":
+ # {
+ # TenantConfig(
+ # source_name=partition_key,
+ # name="name",
+ # source_list=[],
+ # TENANT_GRAPH_NAMESPACE="",
+ # TENANT_GRAPH_SUMMARY_NAMESPACE=""
+ # )
+ # }
+ # }
+ # }
+ default_config = {"ops": {
+ {"upload_release": {"config": {"source_name": partition_key}}},
+ {"upload_summary": {"config": {"source_name": partition_key}}}
+ }}
+ return default_config
diff --git a/dagster/implnets/workflows/ingest/ingest/resources/README.md b/dagster/implnets/workflows/ingest/ingest/resources/README.md
new file mode 100644
index 00000000..240a76e0
--- /dev/null
+++ b/dagster/implnets/workflows/ingest/ingest/resources/README.md
@@ -0,0 +1,5 @@
+
+Let's try to use dagster-aws as the minio configuration.
+
+How to handle cascading configs:
+https://docs.dagster.io/concepts/resources#resources-that-depend-on-other-resources
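+
+A minimal sketch of that "resources that depend on other resources" pattern
+(names here are illustrative, not the scheduler's actual resources):
+
+```python
+from dagster import ConfigurableResource, Definitions
+from dagster_aws.s3 import S3Resource
+
+class GleanerS3(ConfigurableResource):
+    s3: S3Resource  # the nested resource is configured once and shared
+    bucket: str
+
+defs = Definitions(
+    assets=[],
+    resources={
+        "gleaner_s3": GleanerS3(
+            s3=S3Resource(endpoint_url="http://localhost:9000"),
+            bucket="gleaner",
+        ),
+    },
+)
+```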
diff --git a/dagster/implnets/workflows/ingest/ingest/resources/__init__.py b/dagster/implnets/workflows/ingest/ingest/resources/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/dagster/implnets/workflows/ingest/ingest/resources/gleanerS3.py b/dagster/implnets/workflows/ingest/ingest/resources/gleanerS3.py
new file mode 100644
index 00000000..d16a8abb
--- /dev/null
+++ b/dagster/implnets/workflows/ingest/ingest/resources/gleanerS3.py
@@ -0,0 +1,87 @@
+from dagster import asset, get_dagster_logger, define_asset_job, ConfigurableResource
+from dagster_aws.s3 import S3Resource
+
+#from dagster import Field
+from pydantic import Field
+
+from ..utils import PythonMinioAddress
+
+
+class gleanerS3Resource(ConfigurableResource):
+    # this field is named s3 since it holds the underlying S3 resource; the rest are gleaner-specific s3 settings
+ s3: S3Resource
+ GLEANERIO_MINIO_BUCKET: str = Field(
+ description="GLEANERIO_MINIO_BUCKET.")
+    GLEANERIO_MINIO_ADDRESS: str = Field(
+        description="GLEANERIO_MINIO_ADDRESS.")
+    GLEANERIO_MINIO_PORT: str = Field(
+        description="GLEANERIO_MINIO_PORT.")
+ GLEANERIO_MINIO_USE_SSL: bool= Field(
+ default=False)
+ GLEANERIO_CONFIG_PATH : str = Field(
+ description="GLEANERIO_CONFIG_PATH.", default="scheduler/configs/")
+ GLEANERIO_TENANT_FILENAME : str = Field(
+ description="GLEANERIO_TENANT_FILENAME.", default="tenant.yaml")
+ GLEANERIO_SOURCES_FILENAME: str = Field(
+ description="GLEANERIO_SOURCES_FILENAME.", default="gleanerconfig.yaml")
+ # now using the boto s3 embedded in dagster_aws, but just in case we need them
+ GLEANERIO_MINIO_ACCESS_KEY: str = Field(
+ description="GLEANERIO_MINIO_ACCESS_KEY")
+ GLEANERIO_MINIO_SECRET_KEY: str = Field(
+ description="GLEANERIO_MINIO_SECRET_KEY")
+
+    ## https://docs.dagster.io/_apidocs/libraries/dagster-aws#s3
+    # Courtesy method for the ec utilities
+    def MinioOptions(self):
+        return {
+            "secure": self.s3.use_ssl,
+            "access_key": self.s3.aws_access_key_id,
+            "secret_key": self.s3.aws_secret_access_key,
+        }
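+    # typical use, mirroring the ingest assets (res being a gleanerS3Resource):
+    #   s3Minio = utils_s3.MinioDatastore(
+    #       PythonMinioAddress(res.GLEANERIO_MINIO_ADDRESS, res.GLEANERIO_MINIO_PORT),
+    #       res.MinioOptions())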
+## https://docs.dagster.io/_apidocs/libraries/dagster-aws#s3
+# fields from dagster_aws.s3.S3Resource
+# region_name
+# endpoint_url
+# use_ssl
+# aws_access_key_id
+# aws_secret_access_key
+    def listPath(self, path='orgs', recursive=True):
+        # boto3's list_objects has no recursive flag; a Delimiter would be needed for non-recursive listing
+        result = self.s3.get_client().list_objects(
+            Bucket=self.GLEANERIO_MINIO_BUCKET,
+            Prefix=path,
+        )
+        return result["Contents"]
+ def getFile(self, path='test'):
+ try:
+ result = self.s3.get_client().get_object(
+ Bucket=self.GLEANERIO_MINIO_BUCKET,
+ Key=path,
+ )
+ get_dagster_logger().info(
+ f"file {result['Body']}" )
+ return result["Body"]
+ except Exception as ex:
+ get_dagster_logger().info(f"file {path} not found in {self.GLEANERIO_MINIO_BUCKET} at {self.s3.endpoint_url} {ex}")
+ def getTennatFile(self, path=''):
+ if path == '':
+ path= f"{self.GLEANERIO_CONFIG_PATH}{self.GLEANERIO_TENANT_FILENAME}"
+ try:
+ get_dagster_logger().info(f"tenant_path {path} ")
+ return self.getFile( path=path)
+
+ except Exception as ex:
+ get_dagster_logger().info(f"tenant {path} not found ")
+ #endpoint_url =_pythonMinioAddress(GLEANER_MINIO_ADDRESS, port=GLEANER_MINIO_PORT)
+
+ # this will change to use just a sources.
+ def getSourcesFile(self, path=''):
+ if path == '':
+ path= f"{self.GLEANERIO_CONFIG_PATH}{self.GLEANERIO_SOURCES_FILENAME}"
+ try:
+ get_dagster_logger().info(f"sources_path {path} ")
+ return self.getFile( path=path)
+
+ except Exception as ex:
+ get_dagster_logger().info(f"sources_path {path} not found ")
+ #endpoint_url =_pythonMinioAddress(GLEANER_MINIO_ADDRESS, port=GLEANER_MINIO_PORT)
diff --git a/dagster/implnets/workflows/ingest/ingest/resources/gleanerio.py b/dagster/implnets/workflows/ingest/ingest/resources/gleanerio.py
new file mode 100644
index 00000000..937d3c0d
--- /dev/null
+++ b/dagster/implnets/workflows/ingest/ingest/resources/gleanerio.py
@@ -0,0 +1,497 @@
+import io
+import os
+from typing import Any, Mapping, Optional, Sequence
+
+#from dagster import Field
+from pydantic import Field
+
+import pydash
+from dagster import ConfigurableResource, Config, EnvVar, get_dagster_logger
+
+
+
+import time
+from datetime import datetime
+import requests
+
+import docker
+from docker.types import RestartPolicy, ServiceMode
+
+from dagster import In, Nothing, OpExecutionContext, StringSource, op
+
+from dagster._core.utils import parse_env_var
+
+
+from dagster_docker.container_context import DockerContainerContext
+from dagster_docker.docker_run_launcher import DockerRunLauncher
+from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image
+from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference
+
+from .graph import GraphResource,BlazegraphResource
+from .gleanerS3 import gleanerS3Resource
+
+# Let's try to use dagster-aws as the minio configuration
+
+#
+# # Vars and Envs
+# GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
+# # env items
+# URL = os.environ.get('PORTAINER_URL')
+# APIKEY = os.environ.get('PORTAINER_KEY')
+# CONTAINER_WAIT_TIMEOUT= os.environ.get('GLEANERIO_CONTAINER_WAIT_SECONDS', 5)
+#
+# Let's try to use dagster-aws as the minio configuration
+# GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
+# GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
+# GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
+# GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
+# GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
+# GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
+#
+# # set for the earthcube utilities
+# MINIO_OPTIONS={"secure":GLEANER_MINIO_USE_SSL
+#
+# ,"access_key": GLEANER_MINIO_ACCESS_KEY
+# ,"secret_key": GLEANER_MINIO_SECRET_KEY
+# }
+#
+# GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
+# # using GLEANER, even though this is a nabu property... same prefix seems easier
+# GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
+# GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
+# GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
+# GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
+# GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
+# GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
+# GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
+# GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
+# GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
+# GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
+# GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
+# GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
+# GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
+# #GLEANERIO_SUMMARY_GRAPH_ENDPOINT = os.environ.get('GLEANERIO_SUMMARY_GRAPH_ENDPOINT')
+# GLEANERIO_SUMMARY_GRAPH_NAMESPACE = os.environ.get('GLEANERIO_SUMMARY_GRAPH_NAMESPACE',f"{GLEANER_GRAPH_NAMESPACE}_summary" )
+#
+# SUMMARY_PATH = 'graphs/summary'
+# RELEASE_PATH = 'graphs/latest'
+
+# this will probably need to handle the client, and the
+class GleanerioResource(ConfigurableResource):
+
+ DEBUG_CONTAINER: bool
+ # docker/portainer API
+ GLEANERIO_DOCKER_URL: str = Field(
+ description="Docker Endpoint URL.")
+ GLEANERIO_PORTAINER_APIKEY: str = Field(
+ description="Portainer API Key.")
+    # Dockerhub container images
+ GLEANERIO_GLEANER_IMAGE: str = Field(
+ description="GLEANERIO_GLEANER_IMAGE.")
+ GLEANERIO_NABU_IMAGE: str = Field(
+ description="GLEANERIO_NABU_IMAGE.")
+
+ # docker swarm resources. Presently a network and config names
+ GLEANERIO_DOCKER_HEADLESS_NETWORK: str = Field(
+ description="GLEANERIO_HEADLESS_NETWORK.")
+ GLEANERIO_DOCKER_GLEANER_CONFIG: str = Field(
+ description="GLEANERIO_DOCKER_GLEANER_CONFIG.")
+ GLEANERIO_DOCKER_NABU_CONFIG: str = Field(
+ description="GLEANERIO_DOCKER_NABU_CONFIG.")
+
+    GLEANERIO_HEADLESS_ENDPOINT: str = Field(
+        description="GLEANERIO_HEADLESS_ENDPOINT.", default="http://headless:9000/")
+
+# location where config file will be mounted in container
+ GLEANERIO_GLEANER_CONFIG_PATH: str = Field(
+ description="GLEANERIO_DOCKER_GLEANER_CONFIG_PATH.")
+
+ GLEANERIO_NABU_CONFIG_PATH: str = Field(
+ description="GLEANERIO_DOCKER_NABU_CONFIG_PATH.")
+
+# Execution parameter. The logs from LOG_PREFIX will be uploaded to s3 every n seconds.
+ GLEANERIO_DOCKER_CONTAINER_WAIT_TIMEOUT: int = Field(
+ description="CONTAINER_WAIT_TIMEOUT.", default=600)
+ GLEANERIO_LOG_PREFIX: str = Field(
+ description="GLEANERIO_DOCKER_LOG_PREFIX.")
+
+ GLEANERIO_DAGSTER_CONFIG_PATH: str = Field(
+ description="DAGSTER_GLEANERIO_CONFIG_PATH for Project.")
+    gs3: gleanerS3Resource  # wraps the dagster-aws S3Resource (botocore client) plus gleaner settings
+ triplestore: GraphResource # should be a blazegraph... but let's try generic
+ GLEANERIO_GRAPH_NAMESPACE:str = Field(
+ description="GLEANERIO_GRAPH_NAMESPACE for Project.")
+ GLEANERIO_GRAPH_SUMMARY_NAMESPACE:str = Field(
+ description="GLEANERIO_GRAPH_SUMMARY_NAMESPACE for Project.")
+
+ # at present, these are hard coded as os.getenv in sensors.gleaner_summon.sources_schedule
+ GLEANERIO_SCHEDULE_DEFAULT :str = Field(
+ description="GLEANERIO_SCHEDULE_DEFAULT for Project.", default="@weekly")
+ GLEANERIO_SCHEDULE_DEFAULT_TIMEZONE :str = Field(
+ description="GLEANERIO_SCHEDULE_DEFAULT_TIMEZONE for Project.", default="America/Los_Angeles")
+
+ def _get_client(self, docker_container_context: DockerContainerContext):
+ headers = {'X-API-Key': self.GLEANERIO_PORTAINER_APIKEY}
+ client = docker.DockerClient(base_url=self.GLEANERIO_DOCKER_URL, version="1.43")
+ # client = docker.APIClient(base_url=URL, version="1.35")
+ get_dagster_logger().info(f"create docker client")
+ if (client.api._general_configs):
+ client.api._general_configs["HttpHeaders"] = headers
+ else:
+ client.api._general_configs = {"HttpHeaders": headers}
+ client.api.headers['X-API-Key'] = self.GLEANERIO_PORTAINER_APIKEY
+ get_dagster_logger().info(f" docker version {client.version()}")
+ if docker_container_context.registry:
+ client.login(
+ registry=docker_container_context.registry["url"],
+ username=docker_container_context.registry["username"],
+ password=docker_container_context.registry["password"],
+ )
+ return client
+
+ def _create_service(self,
+ op_context: OpExecutionContext,
+ client,
+ container_context: DockerContainerContext,
+ image: str,
+ entrypoint: Optional[Sequence[str]],
+ command: Optional[Sequence[str]],
+ name="",
+ workingdir="/",
+
+ ):
+ env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
+ get_dagster_logger().info(f"create docker service for {name}")
+        ## thoughts
+        # return service and container, since there is exactly one container for a replicated job
+ restart_policy = RestartPolicy(condition='none')
+ # docker.py if replicated job, total completions = replicas
+ # replicas =0 you do not get a container
+        service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
+ get_dagster_logger().info(str(client.configs.list()))
+ # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
+ gleanerconfig = client.configs.list(filters={"name": [self.GLEANERIO_DOCKER_GLEANER_CONFIG]})
+ if gleanerconfig is not None and len(gleanerconfig ) >0:
+ get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
+ else:
+ raise Exception(f"docker config '{self.GLEANERIO_DOCKER_GLEANER_CONFIG}' not found. Please add Gleaner/Nabu configuration files to docker.")
+ nabuconfig = client.configs.list(filters={"name": [self.GLEANERIO_DOCKER_NABU_CONFIG]})
+ if nabuconfig is not None and len(nabuconfig) >0 :
+ get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
+ else:
+ raise Exception(f"docker config '{self.GLEANERIO_DOCKER_NABU_CONFIG}' not found. Please add Gleaner/Nabu configuration files to docker.")
+ get_dagster_logger().info(f"create docker service for {name}")
+ gleaner = ConfigReference(gleanerconfig[0].id, self.GLEANERIO_DOCKER_GLEANER_CONFIG, self.GLEANERIO_GLEANER_CONFIG_PATH)
+ nabu = ConfigReference(nabuconfig[0].id, self.GLEANERIO_DOCKER_NABU_CONFIG, self.GLEANERIO_NABU_CONFIG_PATH)
+ configs = [gleaner, nabu]
+ # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
+ service = client.services.create(
+ image,
+ args=command,
+ env=env_vars,
+ name=name,
+ networks=container_context.networks if len(container_context.networks) else None,
+ restart_policy=restart_policy,
+            mode=service_mode,
+ workdir=workingdir,
+ configs=configs
+ )
+ wait_count = 0
+ while True:
+ time.sleep(1)
+ wait_count += 1
+ get_dagster_logger().debug(str(service.tasks()))
+
+ container_task = service.tasks(filters={"service": name})
+
+ containers = client.containers.list(all=True, filters={"label": f"com.docker.swarm.service.name={name}"})
+ if len(containers) > 0:
+ break
+ if wait_count > 12:
+ raise f"Container for service {name} not starting"
+
+ get_dagster_logger().info(len(containers))
+ return service, containers[0]
+
+ def getImage(self,context):
+ run_container_context = DockerContainerContext.create_for_run(
+ context.dagster_run,
+ context.instance.run_launcher
+ if isinstance(context.instance.run_launcher, DockerRunLauncher)
+ else None,
+ )
+ get_dagster_logger().info(f"call docker _get_client: ")
+        client = self._get_client(run_container_context)  # the helper is named _get_client above
+ client.images.pull(self.GLEANERIO_GLEANER_IMAGE)
+ client.images.pull(self.GLEANERIO_NABU_IMAGE)
+
+    def s3loader(self, data, name, date_string=None):
+        # compute the default per call; a def-time default would freeze one timestamp at import
+        if date_string is None:
+            date_string = datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
+ logname = name + '_{}.log'.format(date_string)
+ objPrefix = self.GLEANERIO_LOG_PREFIX + logname
+ f = io.BytesIO()
+ # length = f.write(bytes(json_str, 'utf-8'))
+ length = f.write(data)
+ f.seek(0)
+ self.gs3.s3.get_client().put_object(Bucket=self.gs3.GLEANERIO_MINIO_BUCKET,
+ Key=objPrefix,
+ Body=f, # io.BytesIO(data),
+ ContentLength=length, # len(data),
+ ContentType="text/plain"
+ )
+ get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
+# rewrite so that we pass in the image, args, name working dir.
+ # we want to setup 'sensors' for when assets are returned by these
+ # data -> returns summon directory, and a release file.
+
+ def execute(self,context, mode, source):
+ ## ------------ Create
+ returnCode = 0
+ get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
+ date_string = datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
+ if str(mode) == "gleaner":
+ IMAGE =self.GLEANERIO_GLEANER_IMAGE
+
+ # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
+ ARGS = ["--cfg", self.GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"]
+ NAME = f"sch_{source}_{str(mode)}"
+ WorkingDir = "/gleaner/"
+ #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
+ # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
+ elif (str(mode) == "prune"):
+ IMAGE = self.GLEANERIO_NABU_IMAGE
+
+ ARGS = ["--cfg", self.GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source]
+ NAME = f"sch_{source}_{str(mode)}"
+ WorkingDir = "/nabu/"
+ Entrypoint = "nabu"
+ # LOGFILE = 'log_nabu.txt' # only used for local log file writing
+ elif (str(mode) == "prov"):
+ IMAGE = self.GLEANERIO_NABU_IMAGE
+
+ ARGS = ["--cfg", self.GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source]
+ NAME = f"sch_{source}_{str(mode)}"
+ WorkingDir = "/nabu/"
+ Entrypoint = "nabu"
+ # LOGFILE = 'log_nabu.txt' # only used for local log file writing
+ elif (str(mode) == "orgs"):
+ IMAGE = self.GLEANERIO_NABU_IMAGE
+
+ ARGS = ["--cfg", self.GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"]
+ NAME = f"sch_{source}_{str(mode)}"
+ WorkingDir = "/nabu/"
+ Entrypoint = "nabu"
+ # LOGFILE = 'log_nabu.txt' # only used for local log file writing
+ elif (str(mode) == "release"):
+ IMAGE = self.GLEANERIO_NABU_IMAGE
+
+ ARGS = ["--cfg", self.GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source]
+ NAME = f"sch_{source}_{str(mode)}"
+ WorkingDir = "/nabu/"
+ Entrypoint = "nabu"
+ # LOGFILE = 'log_nabu.txt' # only used for local log file writing
+ else:
+
+ returnCode = 1
+ return returnCode
+
+ # from docker0dagster
+ run_container_context = DockerContainerContext.create_for_run(
+ context.dagster_run,
+ context.instance.run_launcher
+ if isinstance(context.instance.run_launcher, DockerRunLauncher)
+ else None,
+ )
+ validate_docker_image(IMAGE)
+
+ try:
+ # setup data/body for container create
+ service = None # ensure the finally block below can test it even if creation fails
+ exit_status = 0 # guard: the docker APIError branch below breaks without setting it
+ data = {}
+ data["Image"] = IMAGE
+ data["WorkingDir"] = WorkingDir
+ #data["Entrypoint"] = Entrypoint
+ data["Cmd"] = ARGS
+ #### gleaner
+ # v.BindEnv("minio.address", "MINIO_ADDRESS")
+ # v.BindEnv("minio.port", "MINIO_PORT")
+ # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
+ # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
+ # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
+ # v.BindEnv("minio.bucket", "MINIO_BUCKET")
+ # // v.BindEnv("minio.region", "MINIO_REGION")
+ # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
+ # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
+ # v.BindEnv("sparql.username", "SPARQL_USERNAME")
+ # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
+ # v.BindEnv("s3.domain", "S3_DOMAIN")
+ ### gleaner summoner config
+ # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
+ # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
+ # viperSubtree.BindEnv("mode", "GLEANER_MODE")
+
+ #### NABU config
+ # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
+ # minioSubtress.BindEnv("port", "MINIO_PORT")
+ # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
+ # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
+ # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
+ # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
+ # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
+ # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
+ ###### nabu sparql config
+ # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
+ # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
+ # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
+ # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
+ # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
+ # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
+ ### NABU object
+ # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
+ # viperSubtree.BindEnv("domain", "S3_DOMAIN")
+ # add env variables here, e.g. "Env": ["FOO=bar", "BAZ=quux"]
+
+ # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
+ enva = []
+ enva.append(str("MINIO_ADDRESS={}".format(self.gs3.GLEANERIO_MINIO_ADDRESS))) # the python needs to be wrapped, this does not
+ enva.append(str("MINIO_PORT={}".format(self.gs3.GLEANERIO_MINIO_PORT)))
+ #enva.append(str("MINIO_USE_SSL={}".format(self.gs3.GLEANER_MINIO_USE_SSL)))
+ enva.append(str("MINIO_USE_SSL={}".format(self.gs3.s3.use_ssl)))
+ #enva.append(str("MINIO_SECRET_KEY={}".format(self.gs3.GLEANER_MINIO_SECRET_KEY)))
+ #enva.append(str("MINIO_ACCESS_KEY={}".format(self.gs3.GLEANER_MINIO_ACCESS_KEY)))
+ enva.append(str("MINIO_SECRET_KEY={}".format(self.gs3.s3.aws_secret_access_key)))
+ enva.append(str("MINIO_ACCESS_KEY={}".format(self.gs3.s3.aws_access_key_id)))
+ #enva.append(str("MINIO_BUCKET={}".format(self.gs3.GLEANER_MINIO_BUCKET)))
+ enva.append(str("MINIO_BUCKET={}".format(self.gs3.GLEANERIO_MINIO_BUCKET)))
+ enva.append(str("SPARQL_ENDPOINT={}".format(self.triplestore.GraphEndpoint(self.GLEANERIO_GRAPH_NAMESPACE))))
+ enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(self.GLEANERIO_HEADLESS_ENDPOINT)))
+ enva.append(str("GLEANERIO_DOCKER_HEADLESS_NETWORK={}".format(self.GLEANERIO_DOCKER_HEADLESS_NETWORK)))
+
+ data["Env"] = enva
+ data["HostConfig"] = {
+ "NetworkMode": self.GLEANERIO_DOCKER_HEADLESS_NETWORK,
+ }
+
+
+ # docker dagster
+ get_dagster_logger().info(f"start docker code region: ")
+
+
+ # trying to get headers in:
+ # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
+ op_container_context = DockerContainerContext(
+ # registry=registry,
+ env_vars=enva,
+ networks=[self.GLEANERIO_DOCKER_HEADLESS_NETWORK],
+ container_kwargs={"working_dir": data["WorkingDir"],
+ # "volumes": {
+ # f"{GLEANER_CONFIG_VOLUME}":
+ # {'bind': '/configs', 'mode': 'rw'}
+ # },
+
+
+ },
+ )
+ container_context = run_container_context.merge(op_container_context)
+ get_dagster_logger().info(f"call docker _get_client: ")
+ client = self._get_client(container_context)
+
+ try:
+ get_dagster_logger().info(f"try docker _create_service: ")
+ service, container = self._create_service(
+ context, client, container_context, IMAGE, "", data["Cmd"], name=NAME,
+ workingdir=data["WorkingDir"]
+ )
+ except Exception as err:
+ raise err
+
+
+ cid = container.id # legacy until the start gets fixed
+
+
+ # Removed watching the logs, in favor of periodic upload
+ wait_count = 0
+ while True:
+ wait_count += 1
+ try:
+ exit_status = container.wait(timeout=self.GLEANERIO_DOCKER_CONTAINER_WAIT_TIMEOUT)["StatusCode"]
+ get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
+ # WE PULL THE LOGS, then will throw an error
+ returnCode = exit_status
+ c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
+
+ # write to s3
+ # use minio_resource
+
+ self.s3loader(str(c).encode(), NAME, date_string=date_string) # s3loader needs a bytes like object
+
+ # s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
+ # write to minio (would need the minio info here)
+
+ get_dagster_logger().info(f"container Logs to s3: ")
+ # this needs to be address at some point. https://www.appsloveworld.com/docker/100/85/docker-py-getarchive-destination-folder
+ path = f"{WorkingDir}/logs"
+ tar_archive_stream, tar_stat = container.get_archive(path)
+ archive = bytearray()
+ for chunk in tar_archive_stream:
+ archive.extend(chunk)
+ # use minio_resource
+ self.s3loader(archive, f"{source}_{mode}_runlogs", date_string=date_string)
+ get_dagster_logger().info(f"uploaded logs : {source}_{mode}_runlogs to {path}")
+ break
+ except requests.exceptions.ReadTimeout as ex:
+ path = f"{WorkingDir}/logs"
+ tar_archive_stream, tar_stat = container.get_archive(path)
+ archive = bytearray()
+ for chunk in tar_archive_stream:
+ archive.extend(chunk)
+ # use minio_resource
+ self.s3loader(archive, f"{source}_{mode}_runlogs", date_string=date_string)
+ get_dagster_logger().info(f"uploaded {wait_count}th log : {source}_{mode}_runlogs to {path}")
+ except docker.errors.APIError as ex:
+ get_dagster_logger().info(f"Container Wait docker API error : {str(ex)}")
+ returnCode = 1
+ break
+ if container.status == 'exited' or container.status == 'removed':
+ get_dagster_logger().info(f"Container exited or removed. status: {container.status}")
+ exit_status = container.wait()["StatusCode"]
+ returnCode = exit_status
+ # fetch the logs here; c may be unset if the earlier try never completed
+ c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
+ # use minio_resource
+ self.s3loader(str(c).encode(), NAME, date_string=date_string) # s3loader needs a bytes-like object
+ # s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object
+ # write to minio (would need the minio info here)
+
+ get_dagster_logger().info(f"container Logs to s3: ")
+ # this needs to be address at some point. https://www.appsloveworld.com/docker/100/85/docker-py-getarchive-destination-folder
+ path = f"{WorkingDir}/logs"
+ tar_archive_stream, tar_stat = container.get_archive(path)
+ archive = bytearray()
+ for chunk in tar_archive_stream:
+ archive.extend(chunk)
+ # use minio_resource
+ self.s3loader(archive, f"{source}_{mode}_runlogs", date_string=date_string)
+ get_dagster_logger().info(f"uploaded logs : {source}_{mode}_runlogs to {path}")
+ break
+
+ # ABOVE: future work, need to extract the files and upload them
+ # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
+ # pw_tar.extractall("extract_to/")
+
+
+ if exit_status != 0:
+ raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
+ finally:
+ if not self.DEBUG_CONTAINER:
+ if service:
+ service.remove()
+ get_dagster_logger().info(f"Service Remove: {service.name}")
+ else:
+ get_dagster_logger().info(f"Service Not created, so not removed.")
+
+ else:
+ get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
+
+
+ if (returnCode != 0):
+ get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
+ raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
+ return returnCode
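+ # typical call from an op (hypothetical source name shown):
+ # returnCode = gleaner_resource.execute(context, "gleaner", "some_source")
+ # a non-zero container exit raises, after logs have been uploaded to s3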
diff --git a/dagster/implnets/workflows/ingest/ingest/resources/graph.py b/dagster/implnets/workflows/ingest/ingest/resources/graph.py
new file mode 100644
index 00000000..e2636be3
--- /dev/null
+++ b/dagster/implnets/workflows/ingest/ingest/resources/graph.py
@@ -0,0 +1,128 @@
+import os
+from typing import Any, Dict
+
+import pydash
+from dagster import ConfigurableResource, Config, EnvVar, get_dagster_logger
+
+#from dagster import Field
+from pydantic import Field
+import requests
+
+from .gleanerS3 import gleanerS3Resource
+#Let's try to use dagster-aws as the minio configuration
+from ..utils import PythonMinioAddress
+
+# class AirtableConfig(Config):
+# DAGSTER_GLEANER_CONFIG_PATH = os.environ.get('DAGSTER_GLEANER_CONFIG_PATH', "/scheduler/gleanerconfig.yaml")
+#
+# # Vars and Envs
+# GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
+# # env items
+# URL = os.environ.get('PORTAINER_URL')
+# APIKEY = os.environ.get('PORTAINER_KEY')
+# CONTAINER_WAIT_TIMEOUT= os.environ.get('GLEANERIO_CONTAINER_WAIT_SECONDS', 5)
+#
+# Let's try to use dagster-aws as the minio configuration
+# GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
+# GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
+# GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
+# GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
+# GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
+# GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
+#
+# # set for the earthcube utilities
+# MINIO_OPTIONS={"secure":GLEANER_MINIO_USE_SSL
+#
+# ,"access_key": GLEANER_MINIO_ACCESS_KEY
+# ,"secret_key": GLEANER_MINIO_SECRET_KEY
+# }
+#
+# GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
+# # using GLEANER, even though this is a nabu property... same prefix seems easier
+# GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
+# GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
+# GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
+# GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
+# GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
+# GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
+# GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
+# GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
+# GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
+# GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
+# GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
+# GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
+# GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
+# #GLEANERIO_SUMMARY_GRAPH_ENDPOINT = os.environ.get('GLEANERIO_SUMMARY_GRAPH_ENDPOINT')
+# GLEANERIO_SUMMARY_GRAPH_NAMESPACE = os.environ.get('GLEANERIO_SUMMARY_GRAPH_NAMESPACE',f"{GLEANER_GRAPH_NAMESPACE}_summary" )
+#
+# SUMMARY_PATH = 'graphs/summary'
+# RELEASE_PATH = 'graphs/latest'
+
+
+class GraphResource(ConfigurableResource):
+ GLEANERIO_GRAPH_URL: str = Field(
+ description="GLEANERIO_GRAPH_URL.")
+ GLEANERIO_GRAPH_NAMESPACE: str = Field(
+ description="GLEANERIO_GRAPH_NAMESPACE.")
+ gs3: gleanerS3Resource
+
+# need multiple namespaces. let's do this.
+ def GraphEndpoint(self, namespace):
+ url = f"{self.GLEANERIO_GRAPH_URL}/namespace/{namespace}/sparql"
+ return url
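+# e.g. GraphEndpoint("demo") -> f"{GLEANERIO_GRAPH_URL}/namespace/demo/sparql"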
+
+
+ def post_to_graph(self, source, path='graphs/latest', extension="nq", graphendpoint=None):
+ if graphendpoint is None:
+ graphendpoint = self.GraphEndpoint(self.GLEANERIO_GRAPH_NAMESPACE)
+ # revision of EC utilities, will have a insertFromURL
+ #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
+ proto = "http"
+# this needs to get the file from s3.
+
+ if self.gs3.GLEANERIO_MINIO_USE_SSL:
+ proto = "https"
+ port = self.gs3.GLEANERIO_MINIO_PORT
+ address = PythonMinioAddress(self.gs3.GLEANERIO_MINIO_ADDRESS, self.gs3.GLEANERIO_MINIO_PORT)
+ bucket = self.gs3.GLEANERIO_MINIO_BUCKET
+ release_url = f"{proto}://{address}/{bucket}/{path}/{source}_release.{extension}"
+ # BLAZEGRAPH SPECIFIC
+ # url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
+ # get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
+ # r = requests.post(url)
+ # log.debug(f' status:{r.status_code}') # status:404
+ # get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
+ # if r.status_code == 200:
+ # # ''
+ # if 'data modified="0"' in r.text:
+ # get_dagster_logger().info(f'graph: no data inserted ')
+ # raise Exception("No Data Added: " + r.text)
+ # return True
+ # else:
+ # get_dagster_logger().info(f'graph: error')
+ # raise Exception(f' graph: insert failed: status:{r.status_code}')
+
+ ### GENERIC LOAD FROM
+ url = f"{graphendpoint}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
+ get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
+ loadfrom = {'update': f'LOAD <{release_url}>'}
+ headers = {
+ 'Content-Type': 'application/x-www-form-urlencoded'
+ }
+ r = requests.post(url, headers=headers, data=loadfrom )
+ get_dagster_logger().debug(f' status:{r.status_code}') # status:404
+ get_dagster_logger().info(f'graph: LOAD from {release_url}: status:{r.status_code}')
+ if r.status_code == 200:
+ get_dagster_logger().info(f'graph load response: {str(r.text)} ')
+ # ''
+ if 'mutationCount=0' in r.text:
+ get_dagster_logger().info(f'graph: no data inserted ')
+ #raise Exception("No Data Added: " + r.text)
+ return True
+ else:
+ get_dagster_logger().info(f'graph: error {str(r.text)}')
+ raise Exception(f' graph: failed, LOAD from {release_url}: status:{r.status_code}')
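+ # for reference, the POST above is equivalent to this curl sketch
+ # (hypothetical endpoint and release URL shown):
+ # curl -X POST http://graph:9999/blazegraph/namespace/demo/sparql \
+ # -H 'Content-Type: application/x-www-form-urlencoded' \
+ # --data-urlencode 'update=LOAD <https://minio.example/bucket/graphs/latest/demo_release.nq>'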
+
+class BlazegraphResource(GraphResource):
+ pass
+
diff --git a/dagster/implnets/workflows/ingest/ingest/sensors/__init__.py b/dagster/implnets/workflows/ingest/ingest/sensors/__init__.py
new file mode 100644
index 00000000..68ec81b4
--- /dev/null
+++ b/dagster/implnets/workflows/ingest/ingest/sensors/__init__.py
@@ -0,0 +1,5 @@
+
+from .load_on_release_sensor import release_file_sensor, release_file_sensor_v2
+from .gleaner_summon import sources_sensor, sources_schedule
+from .tenant_sensor import tenant_names_sensor, tenant_names_sensor_v2
+from .s3_configs_sensor import sources_s3_sensor, tenant_s3_sensor
diff --git a/dagster/implnets/workflows/ingest/ingest/sensors/gleaner_summon.py b/dagster/implnets/workflows/ingest/ingest/sensors/gleaner_summon.py
new file mode 100644
index 00000000..27b49a87
--- /dev/null
+++ b/dagster/implnets/workflows/ingest/ingest/sensors/gleaner_summon.py
@@ -0,0 +1,105 @@
+import os
+
+import dagster
+from dagster import (
+ SensorResult, RunRequest,
+ EventLogEntry, AssetKey, asset_sensor,
+ schedule, ScheduleDefinition, DefaultSensorStatus, DefaultScheduleStatus,
+ get_dagster_logger
+)
+from ..assets import (
+ sources_partitions_def
+)
+from ..jobs.summon_assets import summon_asset_job
+
+
+# this monitors the asset; it will harvest a new source.
+# the sources_schedule below will then pick it up on the weekly schedule.
+
+# note on removal of partitions https://github.com/dagster-io/dagster/issues/14026
+@asset_sensor(default_status=DefaultSensorStatus.RUNNING, asset_key=AssetKey(["ingest","sources_names_active"]), job=summon_asset_job
+ # , minimum_interval_seconds=600
+ )
+def sources_sensor(context, asset_event: EventLogEntry):
+ context.log.info(f"sources_sensor: start")
+ assert asset_event.dagster_event and asset_event.dagster_event.asset_key
+ context.log.info(f"asset_key {asset_event.dagster_event.asset_key}")
+# a bit of a pain, but it works: you cannot just pass the asset in like you do in ops,
+ # otherwise it's just an AssetDefinition.
+ sources = context.repository_def.load_asset_value(AssetKey(["ingest","sources_names_active"]))
+ new_sources = [
+ source
+ for source in sources
+ if not sources_partitions_def.has_partition_key(
+ source, dynamic_partitions_store=context.instance
+ )
+ ]
+ removed_sources = [
+ source
+ for source in sources_partitions_def.get_partition_keys(dynamic_partitions_store=context.instance)
+ if source not in sources
+ ]
+ for s in removed_sources:
+ context.instance.delete_dynamic_partition("sources_names_active", s)
+ context.log.info(f"new sources {new_sources}")
+ context.log.info(f"Removed {removed_sources}")
+ return SensorResult(
+ run_requests=[
+ RunRequest(partition_key=source
+ # , job_name=f"{source}_load"
+ , run_key=f"{source}_load"
+ ) for source in new_sources
+ ],
+ dynamic_partitions_requests=[
+ sources_partitions_def.build_add_request(new_sources)
+ ],
+ )
+#
+# https://docs.dagster.io/concepts/partitions-schedules-sensors/schedules#static-partitioned-jobs
+# humm https://github.com/dagster-io/dagster/blob/567cb59f1da819bbb8522108fc2c2a3bace6c7b3/python_modules/dagster-test/dagster_test/toys/schedules.py#L41
+
+# so this needs to be a schedule, and we handle the cron ourselves.
+sched = os.environ.get("GLEANERIO_DEFAULT_SCHEDULE", "@weekly")
+sched_timezone = os.environ.get("GLEANERIO_DEFAULT_SCHEDULE_TIMEZONE", "America/Los_Angeles")
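+# e.g. GLEANERIO_DEFAULT_SCHEDULE="0 3 * * 0" runs at 03:00 every Sunday; "@weekly" is the default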
+@schedule(job=summon_asset_job, cron_schedule=sched,execution_timezone=sched_timezone,
+ default_status=DefaultScheduleStatus.RUNNING,
+ )
+def sources_schedule(context):
+ partition_keys = sources_partitions_def.get_partition_keys(dynamic_partitions_store=context.instance)
+ get_dagster_logger().info(str(partition_keys))
+ return [
+ RunRequest(
+ partition_key=partition_key,
+ # run_key=f"{context.scheduled_execution_time}_{partition_key}"
+ run_key=f"summon_asset_{partition_key}"
+ )
+ for partition_key in partition_keys
+ ]
+
+
+
+# from dagster import sensor, RunRequest, SensorExecutionContext
+# from dagster import (DynamicPartitionsDefinition, job)
+# # Define your dynamic partitions
+# fruits = DynamicPartitionsDefinition(name="fruits")
+# # Define a job that will process the partitions
+# @job()
+# def my_job():
+# # Your job logic here
+# pass
+# # Define a sensor that triggers the job and updates the partitions
+# @sensor(job=my_job)
+# def my_sensor(context: SensorExecutionContext):
+# # Logic to determine if there are new partitions to add
+# # For example, check a directory for new files, query a database, etc.
+# new_partitions = ["apple", "banana"]
+# # Replace with your dynamic logic
+# # Build add requests for the new partitions
+# dynamic_partitions_requests = [fruits.build_add_request(new_partitions)]
+# # Create a run request for each new partition
+# run_requests = [RunRequest(partition_key=partition) for partition in new_partitions]
+# # Return the sensor result with run requests and dynamic partition requests
+# return SensorResult(
+# run_requests=run_requests,
+# dynamic_partitions_requests=dynamic_partitions_requests
+# )
diff --git a/dagster/implnets/workflows/ingest/ingest/sensors/harvest_sched.py b/dagster/implnets/workflows/ingest/ingest/sensors/harvest_sched.py
new file mode 100644
index 00000000..e69de29b
diff --git a/dagster/implnets/workflows/ingest/ingest/sensors/load_on_release_sensor.py b/dagster/implnets/workflows/ingest/ingest/sensors/load_on_release_sensor.py
new file mode 100644
index 00000000..17ab3b5a
--- /dev/null
+++ b/dagster/implnets/workflows/ingest/ingest/sensors/load_on_release_sensor.py
@@ -0,0 +1,129 @@
+from dagster import (
+ op, job, Config, get_dagster_logger, DefaultSensorStatus,
+ sensor, RunRequest, RunConfig,
+ SensorEvaluationContext,
+ SkipReason,
+ AssetKey,
+ static_partitioned_config,
+ asset_sensor, multi_asset_sensor,
+ EventLogEntry
+)
+from dagster_aws.s3.sensor import get_s3_keys
+from typing import List, Dict
+from pydantic import Field
+
+from ..resources.gleanerio import GleanerioResource
+from ..resources.gleanerS3 import gleanerS3Resource
+from ..resources.graph import BlazegraphResource
+from ..assets import tenant_partitions_def,TenantConfig
+from ..jobs.tenant_load import release_asset_job, create_graph_namespaces
+from ..assets.gleaner_summon_assets import RELEASE_PATH, SUMMARY_PATH
+
+#from ..jobs.tennant_load import build_community
+# This sensor needs to detect when a source has completed its run
+# and then load the data into the client's graphstore.
+
+
+
+# #######
+# Put the config for a tenant at the job level so we only have to define it once
+######
+
+
+
+
+
+#@sensor(job=build_community,minimum_interval_seconds=60)
+
+# https://docs.dagster.io/concepts/partitions-schedules-sensors/sensors#using-resources-in-sensors
+# sensor factory example
+# https://github.com/dagster-io/dagster/blob/master/examples/project_fully_featured/project_fully_featured/sensors/hn_tables_updated_sensor.py
+######
+# https://docs.dagster.io/concepts/partitions-schedules-sensors/asset-sensors#when-all-partitions-have-new-materializations
+########
+
+# @asset_sensor(asset_key=AssetKey(["ingest","release_summarize"]),
+# default_status=DefaultSensorStatus.RUNNING,
+# job=release_asset_job, required_resource_keys={"gleanerio"},
+# # minimum_interval_seconds=3600
+# )
+@multi_asset_sensor(
+ monitored_assets=[
+ AssetKey(["ingest","release_summarize"])
+ ],
+ job=release_asset_job,
+ required_resource_keys={"gleanerio"}
+)
+def release_file_sensor_v2(context
+ #,asset_event: EventLogEntry
+ ):
+ # assert asset_event.dagster_event and asset_event.dagster_event.asset_key
+
+ run_requests = []
+ # source_name = asset_event.dagster_event.partition
+ # source_key= asset_event.dagster_event.asset_key
+ # context.log.info(f"partition_key: {source_name} source_key: {source_key}")
+
+ gleaner_resource = context.resources.gleanerio
+ s3_resource = context.resources.gleanerio.gs3.s3
+ gleaner_s3 = context.resources.gleanerio.gs3
+ triplestore = context.resources.gleanerio.triplestore
+ since_key = context.cursor or None
+ context.log.info(f"sinceKey: {since_key}")
+
+ # run_requests = [RunRequest(
+ # partition_key=source_name,
+ # run_key=f"{source_name}_upload_release_{since_key}",
+ # run_config={})]
+ # #context.update_cursor(since_key+1)
+ # context.update_cursor(since_key)
+ # context.log.info(f"sinceKey new: {context.cursor}")
+ # return run_requests
+ for (
+ partition,
+ materializations_by_asset,
+ ) in context.latest_materialization_records_by_partition_and_asset().items():
+ if set(materializations_by_asset.keys()) == set(context.asset_keys):
+ run_requests.append(RunRequest(partition_key=partition,
+ run_key=f"{partition}_upload_release_{since_key}",)
+ )
+ for asset_key, materialization in materializations_by_asset.items():
+ context.advance_cursor({asset_key: materialization})
+ return run_requests
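+# note: pairing latest_materialization_records_by_partition_and_asset() with
+# advance_cursor() appears to be the documented multi_asset_sensor pattern (see the
+# docs link above) for firing one run per partition once every monitored asset has
+# a new materialization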
+@asset_sensor(asset_key=AssetKey(["ingest","release_summarize"]),
+ # default_status=DefaultSensorStatus.RUNNING,
+ job=release_asset_job, required_resource_keys={"gleanerio"},
+ # minimum_interval_seconds=3600
+ )
+def release_file_sensor(context, asset_event: EventLogEntry):
+ gleaner_resource = context.resources.gleanerio
+ s3_resource = context.resources.gleanerio.gs3.s3
+ gleaner_s3 = context.resources.gleanerio.gs3
+ triplestore = context.resources.gleanerio.triplestore
+ since_key = context.cursor or None
+ context.log.info(f"sinceKey: {since_key}")
+ #new_s3_keys = get_s3_keys(gleaner_s3.GLEANERIO_MINIO_BUCKET, prefix=SUMMARY_PATH, since_key=since_key)
+ if since_key is None:
+ new_s3_keys = s3_resource.get_client().list_objects_v2(
+ Bucket=gleaner_s3.GLEANERIO_MINIO_BUCKET,
+ Prefix=SUMMARY_PATH
+ )
+ else:
+ new_s3_keys = s3_resource.get_client().list_objects_v2(
+ Bucket=gleaner_s3.GLEANERIO_MINIO_BUCKET,
+ Prefix=SUMMARY_PATH,
+ StartAfter=since_key
+ )
+ # list_objects_v2 returns a dict; pull the object keys out of 'Contents'
+ new_s3_keys = [obj['Key'] for obj in new_s3_keys.get('Contents', [])]
+ context.log.info(f"keys: {new_s3_keys}")
+ if not new_s3_keys:
+ return SkipReason(f"No new s3 files found for bucket {gleaner_s3.GLEANERIO_MINIO_BUCKET}.")
+ context.log.info(f"new key len: {len(new_s3_keys)}")
+ last_key = new_s3_keys[-1]
+
+ run_requests = [RunRequest(run_key=s3_key, run_config={}) for s3_key in new_s3_keys]
+ context.update_cursor(last_key)
+ #context.update_cursor()
+ context.log.info(f"new sinceKey: {context.cursor}")
+ return run_requests
diff --git a/dagster/implnets/workflows/ingest/ingest/sensors/s3_configs_sensor.py b/dagster/implnets/workflows/ingest/ingest/sensors/s3_configs_sensor.py
new file mode 100644
index 00000000..3ad2d07d
--- /dev/null
+++ b/dagster/implnets/workflows/ingest/ingest/sensors/s3_configs_sensor.py
@@ -0,0 +1,134 @@
+from dagster import (
+ op, job, Config, get_dagster_logger,
+ sensor, RunRequest, RunConfig,
+ SensorEvaluationContext, asset_sensor, EventLogEntry,
+ SkipReason,
+ AssetKey,
+ static_partitioned_config,
+ DefaultSensorStatus
+)
+from dagster_aws.s3.sensor import get_s3_keys
+from typing import List, Dict
+from pydantic import Field
+
+from ..resources.gleanerio import GleanerioResource
+from ..resources.gleanerS3 import gleanerS3Resource
+from ..resources.graph import BlazegraphResource
+from ..assets import tenant_partitions_def,TenantConfig
+from ..jobs.tenant_load import (release_asset_job, create_graph_namespaces, tenant_asset_job)
+from ..jobs.summon_assets import sources_asset_job
+from ..assets.gleaner_summon_assets import RELEASE_PATH, SUMMARY_PATH
+
+#from ..jobs.tennant_load import build_community
+# This sensor needs to detect when a source has completed its run
+# and then load the data into the client's graphstore.
+
+
+
+# #######
+# Put the config for a tenant at the job level so we only have to define it once
+######
+
+
+
+
+
+#@sensor(job=build_community,minimum_interval_seconds=60)
+
+# https://docs.dagster.io/concepts/partitions-schedules-sensors/sensors#using-resources-in-sensors
+# sensor factory example
+# https://github.com/dagster-io/dagster/blob/master/examples/project_fully_featured/project_fully_featured/sensors/hn_tables_updated_sensor.py
+######
+# https://docs.dagster.io/concepts/partitions-schedules-sensors/asset-sensors#when-all-partitions-have-new-materializations
+########
+@sensor(name="s3_config_source_sensor",
+ default_status=DefaultSensorStatus.RUNNING,
+ #, job_name="sources_updated_job",
+ job=sources_asset_job,
+ required_resource_keys={"gleanerio"},
+ # minimum_interval_seconds=3600
+ )
+def sources_s3_sensor(context
+ ):
+ gleaner_resource = context.resources.gleanerio
+ s3_resource = context.resources.gleanerio.gs3.s3
+ gleaner_s3 = context.resources.gleanerio.gs3
+ triplestore = context.resources.gleanerio.triplestore
+ since_key = context.cursor or None
+ get_dagster_logger().info(f"sinceKey: {since_key}")
+ config_path=f"{gleaner_s3.GLEANERIO_CONFIG_PATH}"
+ filename = f"{gleaner_s3.GLEANERIO_CONFIG_PATH}{gleaner_s3.GLEANERIO_SOURCES_FILENAME}"
+
+ try:
+ new_s3_keys = s3_resource.get_client().head_object(
+ Bucket=gleaner_s3.GLEANERIO_MINIO_BUCKET,
+ Key=filename,
+ )
+ except Exception:
+ # head_object raises (rather than returning an empty dict) when the key is missing
+ return SkipReason(f"No new s3 files found for bucket {gleaner_s3.GLEANERIO_MINIO_BUCKET}. {filename}")
+
+ # new_s3_keys = s3_resource.resource.ObjectSummary(
+ # Bucket=gleaner_s3.GLEANERIO_MINIO_BUCKET,
+ # Key=filename,
+ #
+ # )
+
+ # since_key = context.cursor or None
+ # new_s3_keys = get_s3_keys("my_s3_bucket", since_key=since_key)
+ get_dagster_logger().info(f"metadata {new_s3_keys}")
+ #new_s3_keys = list(new_s3_keys)
+ last_key = str(new_s3_keys['LastModified'])
+ get_dagster_logger().info(f"last_modified: {last_key}")
+ run_requests =[]
+ if since_key is None or since_key < last_key:
+ #run_requests = [RunRequest(run_key=s3_key, run_config={}) for s3_key in new_s3_keys]
+ run_requests = [RunRequest(run_key=last_key, run_config={})]
+ context.update_cursor(last_key)
+ return run_requests
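+# cursor sketch: the cursor stores str(LastModified) of the sources file; a new run
+# fires only when the stored string compares less than the latest one (lexicographic
+# comparison works on the assumption that boto3 returns a fixed timestamp format)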
+
+@sensor(name="s3_configs_tenant__sensor",
+ default_status=DefaultSensorStatus.RUNNING,
+ #, job_name="sources_updated_job",
+ job=tenant_asset_job,
+ required_resource_keys={"gleanerio"},
+ # minimum_interval_seconds=3600
+ )
+def tenant_s3_sensor(context
+ ):
+ gleaner_resource = context.resources.gleanerio
+ s3_resource = context.resources.gleanerio.gs3.s3
+ gleaner_s3 = context.resources.gleanerio.gs3
+ triplestore = context.resources.gleanerio.triplestore
+ since_key = context.cursor or None
+ get_dagster_logger().info(f"sinceKey: {since_key}")
+ config_path=f"{gleaner_s3.GLEANERIO_CONFIG_PATH}"
+ filename = f"{gleaner_s3.GLEANERIO_CONFIG_PATH}{gleaner_s3.GLEANERIO_TENANT_FILENAME}"
+
+ try:
+ new_s3_keys = s3_resource.get_client().head_object(
+ Bucket=gleaner_s3.GLEANERIO_MINIO_BUCKET,
+ Key=filename,
+ )
+ except Exception:
+ # head_object raises (rather than returning an empty dict) when the key is missing
+ return SkipReason(f"No new s3 files found for bucket {gleaner_s3.GLEANERIO_MINIO_BUCKET}. {filename}")
+
+ # new_s3_keys = s3_resource.resource.ObjectSummary(
+ # Bucket=gleaner_s3.GLEANERIO_MINIO_BUCKET,
+ # Key=filename,
+ #
+ # )
+
+ # since_key = context.cursor or None
+ # new_s3_keys = get_s3_keys("my_s3_bucket", since_key=since_key)
+ get_dagster_logger().info(f"metadata {new_s3_keys}")
+ #new_s3_keys = list(new_s3_keys)
+ last_key = str(new_s3_keys['LastModified'])
+ get_dagster_logger().info(f"last_modified: {last_key}")
+ run_requests =[]
+ if since_key is None or since_key < last_key:
+ #run_requests = [RunRequest(run_key=s3_key, run_config={}) for s3_key in new_s3_keys]
+ run_requests = [RunRequest(run_key=last_key, run_config={})]
+ context.update_cursor(last_key)
+ return run_requests
diff --git a/dagster/implnets/workflows/ingest/ingest/sensors/tenant_sensor.py b/dagster/implnets/workflows/ingest/ingest/sensors/tenant_sensor.py
new file mode 100644
index 00000000..42fd704e
--- /dev/null
+++ b/dagster/implnets/workflows/ingest/ingest/sensors/tenant_sensor.py
@@ -0,0 +1,104 @@
+from dagster import (
+ op, job, Config, get_dagster_logger,
+ sensor, RunRequest, RunConfig, SensorResult,
+ SensorEvaluationContext, asset_sensor, EventLogEntry,
+ SkipReason,
+ AssetKey,
+ static_partitioned_config, DynamicPartitionsDefinition,
+ DefaultSensorStatus, DefaultScheduleStatus
+)
+from ..jobs.tenant_load import tenant_namespaces_job, release_asset_job
+from ..assets import tenant_partitions_def
+#from ..assets.tenant import build_community
+
+## Thinking: we were doing this the wrong way.
+## For each source, we dynamically generate the set of tenants to load, rather than reloading every tenant.
+## So, at the end of a source load, we trigger a tenant load;
+## this figures out which tenants to load, and calls those ops.
+
+## So the asset key is not tenant_names; it is still source_names_active.
+
+# now we do need to build tenants when a new tenant is added.
+# this should just handle the creation of namespaces, and adding the UIs
+
+@asset_sensor( asset_key=AssetKey(["ingest","tenant_names"]),
+ default_status=DefaultSensorStatus.RUNNING,
+#default_status=DefaultScheduleStatus.RUNNING,
+ job=tenant_namespaces_job,
+ # jobs=[tenant_namespaces_job,release_asset_job]
+ # , minimum_interval_seconds=600
+ )
+def tenant_names_sensor(context, asset_event: EventLogEntry):
+ context.log.info(f"tenant_names_sensor: start")
+ assert asset_event.dagster_event and asset_event.dagster_event.asset_key
+ context.log.info(f"asset_key: {asset_event.dagster_event.asset_key}")
+# a bit of a pain, but it works: you cannot just pass the asset in like you do in ops,
+ # otherwise it's just an AssetDefinition.
+ tenants = context.repository_def.load_asset_value(AssetKey(["ingest","tenant_names"]))
+ new_tenants = [
+ tenant
+ for tenant in tenants
+ if not tenant_partitions_def.has_partition_key(
+ tenant, dynamic_partitions_store=context.instance
+ )
+ ]
+ removed_tenants = [
+ tenant
+ for tenant in tenant_partitions_def.get_partition_keys(dynamic_partitions_store=context.instance)
+ if tenant not in tenants
+ ]
+ for t in removed_tenants:
+ context.instance.delete_dynamic_partition("tenant_names_paritition", t)
+ context.log.info(f"Removed {removed_tenants}")
+ context.log.info(f"new tenant {new_tenants}")
+ return SensorResult(
+ run_requests=[
+ RunRequest(partition_key=tenant
+ # , job_name=f"{source}_load"
+ , run_key=f"{tenant}_tenant"
+ ) for tenant in new_tenants
+ ],
+ dynamic_partitions_requests=[
+ tenant_partitions_def.build_add_request(new_tenants)
+ ],
+ )
+
+@asset_sensor( asset_key=AssetKey(["ingest","tenant_names"]),
+ default_status=DefaultSensorStatus.RUNNING,
+#default_status=DefaultScheduleStatus.RUNNING,
+ # job=tenant_namespaces_job,
+ jobs=[tenant_namespaces_job,release_asset_job]
+ # , minimum_interval_seconds=600
+ )
+def tenant_names_sensor_v2(context, asset_event: EventLogEntry):
+ assert asset_event.dagster_event and asset_event.dagster_event.asset_key
+
+# a bit of a pain, but it works: you cannot just pass the asset in like you do in ops,
+ # otherwise it's just an AssetDefinition.
+ tenants = context.repository_def.load_asset_value(AssetKey(["ingest","tenant_names"]))
+ new_tenants = [
+ tenant
+ for tenant in tenants
+ if not tenant_partitions_def.has_partition_key(
+ tenant, dynamic_partitions_store=context.instance
+ )
+ ]
+# in order for this to work, the tenant_release_job needs to be fed valid sources,
+# from some aggregate of the sources in the new_tenants[*]['sources']; see the sketch below.
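+# a minimal sketch, assuming each entry in new_tenants is a dict with a 'sources'
+# list (hypothetical helper, not wired in yet):
+# sources = sorted({s for t in new_tenants for s in t.get('sources', [])})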
+
+ return SensorResult(
+ run_requests=[
+ RunRequest(partition_key=tenant
+ , job_name="tenant_namespaces_job"
+ , run_key=f"{tenant}_tenant_namespace"
+ ) for tenant in new_tenants
+ ] + [
+ RunRequest(partition_key=tenant
+ , job_name="tenant_release_job"
+ , run_key=f"{tenant}_tenant_release"
+ ) for tenant in new_tenants
+ ],
+ dynamic_partitions_requests=[
+ tenant_partitions_def.build_add_request(new_tenants)
+ ],
+ )
diff --git a/dagster/implnets/workflows/ingest/ingest/utils.py b/dagster/implnets/workflows/ingest/ingest/utils.py
new file mode 100644
index 00000000..df35ee78
--- /dev/null
+++ b/dagster/implnets/workflows/ingest/ingest/utils.py
@@ -0,0 +1,10 @@
+
+
+def PythonMinioAddress(url, port=None):
+ if (url.endswith(".amazonaws.com")):
+ PYTHON_MINIO_URL = "s3.amazonaws.com"
+ else:
+ PYTHON_MINIO_URL = url
+ if port is not None:
+ PYTHON_MINIO_URL = f"{PYTHON_MINIO_URL}:{port}"
+ return PYTHON_MINIO_URL
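+# e.g. PythonMinioAddress("minio.example.org", 9000) -> "minio.example.org:9000"
+# PythonMinioAddress("mybucket.s3.amazonaws.com") -> "s3.amazonaws.com"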
diff --git a/dagster/implnets/workflows/ingest/ingest_tests/__init__.py b/dagster/implnets/workflows/ingest/ingest_tests/__init__.py
new file mode 100644
index 00000000..8b137891
--- /dev/null
+++ b/dagster/implnets/workflows/ingest/ingest_tests/__init__.py
@@ -0,0 +1 @@
+
diff --git a/dagster/implnets/workflows/ingest/ingest_tests/config.yaml b/dagster/implnets/workflows/ingest/ingest_tests/config.yaml
new file mode 100644
index 00000000..e69de29b
diff --git a/dagster/implnets/workflows/ingest/ingest_tests/test_assets.py b/dagster/implnets/workflows/ingest/ingest_tests/test_assets.py
new file mode 100644
index 00000000..8b137891
--- /dev/null
+++ b/dagster/implnets/workflows/ingest/ingest_tests/test_assets.py
@@ -0,0 +1 @@
+
diff --git a/dagster/implnets/workflows/ingest/pyproject.toml b/dagster/implnets/workflows/ingest/pyproject.toml
new file mode 100644
index 00000000..30442bfc
--- /dev/null
+++ b/dagster/implnets/workflows/ingest/pyproject.toml
@@ -0,0 +1,6 @@
+[build-system]
+requires = ["setuptools"]
+build-backend = "setuptools.build_meta"
+
+[tool.dagster]
+module_name = "ingest"
diff --git a/dagster/implnets/workflows/ingest/setup.cfg b/dagster/implnets/workflows/ingest/setup.cfg
new file mode 100644
index 00000000..e79daae3
--- /dev/null
+++ b/dagster/implnets/workflows/ingest/setup.cfg
@@ -0,0 +1,2 @@
+[metadata]
+name = tutorial
diff --git a/dagster/implnets/workflows/ingest/setup.py b/dagster/implnets/workflows/ingest/setup.py
new file mode 100644
index 00000000..e38cdd3c
--- /dev/null
+++ b/dagster/implnets/workflows/ingest/setup.py
@@ -0,0 +1,12 @@
+from setuptools import find_packages, setup
+
+setup(
+ name="tutorial",
+ packages=find_packages(exclude=["tutorial_tests"]),
+ install_requires=[
+ "dagster",
+ "dagster-cloud",
+ "Faker==18.4.0",
+ ],
+ extras_require={"dev": ["dagit", "pytest"]},
+)
diff --git a/dagster/implnets/workflows/tasks/data/source_list.json b/dagster/implnets/workflows/tasks/data/source_list.json
new file mode 100644
index 00000000..0637a088
--- /dev/null
+++ b/dagster/implnets/workflows/tasks/data/source_list.json
@@ -0,0 +1 @@
+[]
\ No newline at end of file
diff --git a/dagster/implnets/workflows/tasks/tasks/__init__.py b/dagster/implnets/workflows/tasks/tasks/__init__.py
index a36556aa..6d55696d 100644
--- a/dagster/implnets/workflows/tasks/tasks/__init__.py
+++ b/dagster/implnets/workflows/tasks/tasks/__init__.py
@@ -1,9 +1,86 @@
-from dagster import Definitions, load_assets_from_modules
-
+import os
+from distutils.util import strtobool
+from dagster import Definitions, load_assets_from_modules, EnvVar
+from dagster_aws.s3 import S3Resource
+#from dagster_slack import SlackResource, make_slack_on_run_failure_sensor
from . import assets
+from .sch import weekly_sch
+from .sch.s3_sensor import tenant_s3_sensor
+from .assets.tenants import community_sensor
+
+from .resources.graph import BlazegraphResource, GraphResource
+from .resources.gleanerS3 import gleanerS3Resource
+
+from dagster_slack import SlackResource, make_slack_on_run_failure_sensor
+slack_on_run_failure = make_slack_on_run_failure_sensor(
+ os.getenv("SLACK_CHANNEL"),
+ os.getenv("SLACK_TOKEN")
+)
+def _awsEndpointAddress(url, port=None, use_ssl=True):
+ if use_ssl:
+ protocol = "https"
+ else:
+ protocol = "http"
+ if port is not None:
+ return f"{protocol}://{url}:{port}"
+ else:
+ return f"{protocol}://{url}"
all_assets = load_assets_from_modules([assets])
+# as noted: https://docs.dagster.io/concepts/assets/software-defined-assets#from-assets-in-a-sub-module
+# tried to use load_assets_from_modules([assets], key_prefix=["tasks"])
+# this meant that the prefix had to be included in the code... so, just add it individually
+weekly_data_schedule=[ weekly_sch.loadstats_schedule, weekly_sch.all_graph_stats_schedule]
+s3 = S3Resource(
+ endpoint_url=_awsEndpointAddress(EnvVar('GLEANERIO_MINIO_ADDRESS').get_value(),
+ port=EnvVar('GLEANERIO_MINIO_PORT').get_value()),
+ aws_access_key_id=EnvVar('GLEANERIO_MINIO_ACCESS_KEY'),
+ aws_secret_access_key=EnvVar('GLEANERIO_MINIO_SECRET_KEY')
+)
+minio=gleanerS3Resource(
+ s3=s3,
+ # GLEANER_MINIO_BUCKET =EnvVar('GLEANER_MINIO_BUCKET'),
+ # GLEANER_MINIO_ADDRESS=EnvVar('GLEANER_MINIO_ADDRESS'),
+ # GLEANER_MINIO_PORT=EnvVar('GLEANER_MINIO_PORT'),
+
+ GLEANERIO_MINIO_BUCKET=EnvVar('GLEANERIO_MINIO_BUCKET'),
+ GLEANERIO_MINIO_ADDRESS=EnvVar('GLEANERIO_MINIO_ADDRESS'),
+ GLEANERIO_MINIO_PORT=EnvVar('GLEANERIO_MINIO_PORT'),
+ GLEANERIO_MINIO_ACCESS_KEY=EnvVar('GLEANERIO_MINIO_ACCESS_KEY'),
+ GLEANERIO_MINIO_SECRET_KEY=EnvVar('GLEANERIO_MINIO_SECRET_KEY'),
+ GLEANERIO_CONFIG_PATH=EnvVar('GLEANERIO_CONFIG_PATH'),
+ GLEANERIO_TENANT_FILENAME=EnvVar('GLEANERIO_TENANT_FILENAME')
+
+)
+triplestore=BlazegraphResource(
+ GLEANERIO_GRAPH_URL=EnvVar('GLEANERIO_GRAPH_URL'),
+ GLEANERIO_GRAPH_NAMESPACE=EnvVar('GLEANERIO_GRAPH_NAMESPACE'),
+ GLEANERIO_GRAPH_SUMMARY_NAMESPACE=EnvVar('GLEANERIO_GRAPH_SUMMARY_NAMESPACE'),
+ GLEANERIO_GRAPH_SUMMARIZE=EnvVar('GLEANERIO_GRAPH_SUMMARIZE'),
+ s3=minio,
+ )
+
+
+resources = {
+ "local": {
+
+ "s3":minio,
+ "triplestore": triplestore,
+ # "slack": SlackResource(token=EnvVar("SLACK_TOKEN")),
+ },
+ "production": {
+
+ "s3":minio,
+ "triplestore":triplestore,
+ # "slack":SlackResource(token=EnvVar("SLACK_TOKEN")),
+ },
+}
+
+deployment_name = os.environ.get("DAGSTER_DEPLOYMENT", "local")
defs = Definitions(
assets=all_assets,
+ schedules=weekly_data_schedule,
+ resources=resources[deployment_name],
+ sensors=[community_sensor, tenant_s3_sensor, slack_on_run_failure]
)
diff --git a/dagster/implnets/workflows/tasks/tasks/assets.py b/dagster/implnets/workflows/tasks/tasks/assets.py
deleted file mode 100644
index 7260b45a..00000000
--- a/dagster/implnets/workflows/tasks/tasks/assets.py
+++ /dev/null
@@ -1,62 +0,0 @@
-import json
-import os
-
-import pandas as pd
-from dagster import asset, get_dagster_logger
-from ec.datastore import s3
-
-GLEANER_MINIO_ADDRESS = os.environ.get('GLEANERIO_MINIO_ADDRESS')
-GLEANER_MINIO_PORT = os.environ.get('GLEANERIO_MINIO_PORT')
-GLEANER_MINIO_USE_SSL = os.environ.get('GLEANERIO_MINIO_USE_SSL')
-GLEANER_MINIO_SECRET_KEY = os.environ.get('GLEANERIO_MINIO_SECRET_KEY')
-GLEANER_MINIO_ACCESS_KEY = os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')
-GLEANER_MINIO_BUCKET = os.environ.get('GLEANERIO_MINIO_BUCKET')
-# set for the earhtcube utiltiies
-MINIO_OPTIONS={"secure":GLEANER_MINIO_USE_SSL
-
- ,"access_key": GLEANER_MINIO_ACCESS_KEY
- ,"secret_key": GLEANER_MINIO_SECRET_KEY
- }
-REPORT_PATH = "reports/"
-ORG_PATH = "orgs/"
-STAT_FILE_NAME = "missing_report_graph.json"
-def _pythonMinioUrl(url):
-
- if (url.endswith(".amazonaws.com")):
- PYTHON_MINIO_URL = "s3.amazonaws.com"
- else:
- PYTHON_MINIO_URL = url
- return PYTHON_MINIO_URL
-
-def getName(name):
- return name.replace("orgs/","").replace(".nq","")
-@asset()
-def source_list() -> None:
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), MINIO_OPTIONS)
- orglist = s3Minio.listPath(GLEANER_MINIO_BUCKET, ORG_PATH,recursive=False)
- sources = map( lambda f: { "name": getName(f.object_name)}, orglist )
-
- os.makedirs("data", exist_ok=True)
-
-
- with open("data/source_list.json", "w") as f:
- json.dump(list(sources), f)
-#@asset(deps=[source_list])
-@asset(deps=[source_list])
-def loadstats() -> None:
- logger = get_dagster_logger()
- s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS),MINIO_OPTIONS)
- # sourcelist = list(s3Minio.listPath(GLEANER_MINIO_BUCKET, ORG_PATH,recursive=False))
-
- with open("data/source_list.json","r" ) as f:
- sourcelist = json.load(f)
- stats = []
- for source in sourcelist:
- try:
- stat = s3Minio.getReportFile(GLEANER_MINIO_BUCKET,source.get("name"), STAT_FILE_NAME )
- stat = json.loads(stat)
- stats.append(stat)
- except:
- logger.info(f"Failed to get { source.get('name')} ")
- df = pd.DataFrame(stats)
- df.to_csv("data/weekly_stats.csv")
diff --git a/dagster/implnets/workflows/tasks/tasks/assets/__init__.py b/dagster/implnets/workflows/tasks/tasks/assets/__init__.py
new file mode 100644
index 00000000..50ddb949
--- /dev/null
+++ b/dagster/implnets/workflows/tasks/tasks/assets/__init__.py
@@ -0,0 +1,3 @@
+from .source_stats import source_list, loadstatsHistory
+from .all_graph_stats import sos_types, all_report_stats
+from .tenants import task_tenant_sources, task_tenant_names, loadstatsCommunity
diff --git a/dagster/implnets/workflows/tasks/tasks/assets/all_graph_stats.py b/dagster/implnets/workflows/tasks/tasks/assets/all_graph_stats.py
new file mode 100644
index 00000000..63f1dcaf
--- /dev/null
+++ b/dagster/implnets/workflows/tasks/tasks/assets/all_graph_stats.py
@@ -0,0 +1,123 @@
+from distutils import util
+import json
+import os
+
+from dagster import asset, define_asset_job, get_dagster_logger, AssetKey
+from ec.graph.sparql_query import queryWithSparql
+from ec.reporting.report import generateGraphReportsRepo, reportTypes, generateReportStats
+from ec.datastore import s3
+from ec.logger import config_app
+from .tenants import task_tenant_names
+from pydash import find
+
+log = config_app()
+
+
+# GLEANERIO_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
+# GLEANERIO_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
+# GLEANERIO_MINIO_USE_SSL = bool(util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL', 'true')))
+# GLEANERIO_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
+# GLEANERIO_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
+# GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
+#
+# # set for the earhtcube utiltiies
+# MINIO_OPTIONS={"secure":GLEANERIO_MINIO_USE_SSL
+#
+# ,"access_key": GLEANERIO_MINIO_ACCESS_KEY
+# ,"secret_key": GLEANERIO_MINIO_SECRET_KEY
+# }
+#
+# GLEANERIO_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
+# # using GLEANER, even though this is a nabu property... same prefix seems easier
+# GLEANERIO_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
+# GLEANERIO_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
+# GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
+# GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
+# GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
+# GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
+# GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
+# GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
+# GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
+# GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
+# GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
+# GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
+# GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
+# GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
+# #GLEANERIO_SUMMARY_GRAPH_ENDPOINT = os.environ.get('GLEANERIO_SUMMARY_GRAPH_ENDPOINT')
+# GLEANERIO_SUMMARY_GRAPH_NAMESPACE = os.environ.get('GLEANERIO_GRAPH_NAMESPACE',f"{GLEANERIO_GRAPH_NAMESPACE}_summary" )
+# GLEANERIO_SUMMARIZE_GRAPH=(os.getenv('GLEANERIO_GRAPH_SUMMARIZE', 'False').lower() == 'true')
+# GLEANERIO_CSV_CONFIG_URL = str(os.environ.get('GLEANERIO_CSV_CONFIG_URL'))
+
+SUMMARY_PATH = 'graphs/summary'
+RELEASE_PATH = 'graphs/latest'
+
+def _graphSummaryEndpoint(community, graph_resource):
+ if community == "all":
+ url = f"{graph_resource.GLEANERIO_GRAPH_URL}/namespace/{graph_resource.GLEANERIO_GRAPH_SUMMARY_NAMESPACE}/sparql"
+ else:
+ url = f"{graph_resource.GLEANERIO_GRAPH_URL}/namespace/{community}_summary/sparql"
+ return url
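+# e.g. _graphSummaryEndpoint("deepoceans", graph_resource)
+# -> f"{GLEANERIO_GRAPH_URL}/namespace/deepoceans_summary/sparql"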
+@asset(group_name="graph",key_prefix="task", required_resource_keys={"triplestore"})
+def sos_types(context ):
+ s3_resource = context.resources.triplestore.s3
+ graph_resource = context.resources.triplestore
+ graphendpoint = f"{graph_resource.GLEANERIO_GRAPH_URL}/namespace/{graph_resource.GLEANERIO_GRAPH_NAMESPACE}/sparql"
+ get_dagster_logger().info("sos types endpoint: {}".format(graphendpoint))
+ report = queryWithSparql("all_count_types", graphendpoint, parameters=None)
+ report_csv =report.to_csv()
+ # report_json = generateGraphReportsRepo("all",
+ # "", reportList=reportTypes["all"])
+ MINIO_OPTIONS={"secure":s3_resource.GLEANERIO_MINIO_USE_SSL
+
+ ,"access_key": s3_resource.GLEANERIO_MINIO_ACCESS_KEY
+ ,"secret_key": s3_resource.GLEANERIO_MINIO_SECRET_KEY
+ }
+ s3Minio = s3.MinioDatastore( s3_resource.GLEANERIO_MINIO_ADDRESS, MINIO_OPTIONS)
+ #data = f.getvalue()
+
+ bucketname, objectname = s3Minio.putReportFile(s3_resource.GLEANERIO_MINIO_BUCKET,"all","sos_types.csv",report_csv)
+ return bucketname, objectname, report_csv
+
+#@asset(group_name="graph",key_prefix="task", required_resource_keys={"triplestore"})
+def all_report_stats(context, task_tenant_names):
+ s3_resource = context.resources.triplestore.s3
+ graph_resource = context.resources.triplestore
+ MINIO_OPTIONS={"secure":s3_resource.GLEANERIO_MINIO_USE_SSL
+
+ ,"access_key": s3_resource.GLEANERIO_MINIO_ACCESS_KEY
+ ,"secret_key": s3_resource.GLEANERIO_MINIO_SECRET_KEY
+ }
+ s3Minio = s3.MinioDatastore( s3_resource.GLEANERIO_MINIO_ADDRESS, MINIO_OPTIONS)
+ bucket = s3_resource.GLEANERIO_MINIO_BUCKET
+ # this was a file with a list of sources for a community.
+ # it now exists in the tenant configuration file.
+ #source_url = s3_resource.GLEANERIO_CSV_CONFIG_URL
+
+ tenants_all = context.repository_def.load_asset_value(AssetKey("tenant_all"))['tenant']
+
+ # TODO: remove the hardcoded community list
+ #community_list = ["all", "deepoceans", "ecoforecast", "geochemistry"]
+ #community_list = context.repository_def.load_asset_value(AssetKey("tenant_names"))
+ community_list = task_tenant_names
+ if (graph_resource.GLEANERIO_GRAPH_SUMMARIZE):
+ for community in community_list:
+ community_tenant = find(tenants_all, lambda x: x["community"] == community)
+ community_sources = community_tenant.get('sources')
+ try:
+ # update generateReportStats to take an array of source names
+ report = generateReportStats(community_sources, bucket, s3Minio, _graphSummaryEndpoint(community, graph_resource), community)
+ bucketname, objectname = s3Minio.putReportFile(bucket, "all", f"report_{community}_stats.json", report)
+ except Exception as e:
+ get_dagster_logger().info(f"Summary report errors: {str(e)}")
+
+#all_urn_w_types_toplevel.sparql
+# returns all graph urns with a type.
+# def top_level_types():
+# graphendpoint = f"{GLEANERIO_GRAPH_URL}/namespace/{GLEANERIO_GRAPH_NAMESPACE}/sparql"
+# report = queryWithSparql("all_urn_w_types_toplevel", graphendpoint, parameters=None)
+# report_csv =report.to_csv()
+# # report_json = generateGraphReportsRepo("all",
+# # "", reportList=reportTypes["all"])
+# s3Minio = s3.MinioDatastore( GLEANERIO_MINIO_ADDRESS, MINIO_OPTIONS)
+
+
diff --git a/dagster/implnets/workflows/tasks/tasks/assets/source_stats.py b/dagster/implnets/workflows/tasks/tasks/assets/source_stats.py
new file mode 100644
index 00000000..2c465d77
--- /dev/null
+++ b/dagster/implnets/workflows/tasks/tasks/assets/source_stats.py
@@ -0,0 +1,96 @@
+import distutils
+import json
+import os
+from typing import List, Any
+import pandas as pd
+from dagster import asset, get_dagster_logger, define_asset_job
+from ec.datastore import s3
+from pydash import pick
+from distutils import util
+
+GLEANER_MINIO_ADDRESS = os.environ.get('GLEANERIO_MINIO_ADDRESS')
+GLEANER_MINIO_PORT = os.environ.get('GLEANERIO_MINIO_PORT')
+GLEANER_MINIO_USE_SSL = bool(util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL', 'true')))
+GLEANER_MINIO_SECRET_KEY = os.environ.get('GLEANERIO_MINIO_SECRET_KEY')
+GLEANER_MINIO_ACCESS_KEY = os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')
+GLEANER_MINIO_BUCKET = os.environ.get('GLEANERIO_MINIO_BUCKET')
+# set for the earthcube utilities
+MINIO_OPTIONS={"secure":GLEANER_MINIO_USE_SSL
+
+ ,"access_key": GLEANER_MINIO_ACCESS_KEY
+ ,"secret_key": GLEANER_MINIO_SECRET_KEY
+ }
+REPORT_PATH = "reports/"
+TASKS_PATH="tasks/"
+ORG_PATH = "orgs/"
+STAT_FILE_NAME = "load_report_graph.json"
+def _pythonMinioUrl(url):
+
+ if (url.endswith(".amazonaws.com")):
+ PYTHON_MINIO_URL = "s3.amazonaws.com"
+ else:
+ PYTHON_MINIO_URL = url
+ return PYTHON_MINIO_URL
+
+def getName(name):
+ return name.replace("orgs/","").replace(".nq","")
+@asset(group_name="load",key_prefix="task",)
+def source_list() -> List[Any]:
+ s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), MINIO_OPTIONS)
+ orglist = s3Minio.listPath(GLEANER_MINIO_BUCKET, ORG_PATH,recursive=False)
+ sources = map( lambda f: { "name": getName(f.object_name)}, orglist )
+ sources=list(sources)
+ source_json = json.dumps(sources)
+ os.makedirs("data", exist_ok=True)
+
+ s3Minio.putReportFile(GLEANER_MINIO_BUCKET, "all", f"source_list.json", source_json )
+ # with open("data/source_list.json", "w") as f:
+ # json.dump(list(sources), f)
+ return sources
+#@asset(deps=[source_list])
+
+# set a prefix so we can have some named stats file
+
+#@asset( group_name="load",key_prefix="task",)
+@asset(group_name="load",key_prefix="task",)
+def loadstatsHistory(context,source_list) -> str:
+ prefix="history"
+ logger = get_dagster_logger()
+ s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS),MINIO_OPTIONS)
+ # sourcelist = list(s3Minio.listPath(GLEANER_MINIO_BUCKET, ORG_PATH,recursive=False))
+
+ # with open("data/source_list.json","r" ) as f:
+ # sourcelist = json.load(f)
+ sourcelist=source_list
+ stats = []
+ for source in sourcelist:
+ try:
+ # stat = s3Minio.getReportFile(GLEANER_MINIO_BUCKET,source.get("name"), STAT_FILE_NAME )
+ repo = source.get("name")
+ dirs = s3Minio.listPath( GLEANER_MINIO_BUCKET,f"{REPORT_PATH}{repo}/",recursive=False )
+ for d in dirs:
+ latestpath = f"{REPORT_PATH}{repo}/latest/"
+ if (d.object_name.casefold() == latestpath.casefold()) or (not d.is_dir):
+ continue
+ path = f"{d.object_name}{STAT_FILE_NAME}"
+ s3ObjectInfo = {"bucket_name": GLEANER_MINIO_BUCKET, "object_name": path}
+ try:
+ resp = s3Minio.getFileFromStore(s3ObjectInfo)
+ stat = json.loads(resp)
+ stat = pick(stat, 'source', 'sitemap', 'date', 'sitemap_count', 'summoned_count',
+ 'missing_sitemap_summon_count',
+ 'graph_urn_count', 'missing_summon_graph_count')
+ stats.append(stat)
+ except Exception as ex:
+ logger.info(f"no missing graph report {source.get('name')} {ex}")
+ except Exception as ex:
+ logger.info(f"Failed to get { source.get('name')} {ex}")
+ df = pd.DataFrame(stats)
+ os.makedirs("data", exist_ok=True)
+ df.to_csv("data/all_stats.csv")
+ df_csv = df.to_csv()
+ s3Minio.putReportFile(GLEANER_MINIO_BUCKET, "all", f"all_stats.csv", df_csv)
+ context.log.info(f"all_stats.csv uploaded using putReportFile s3://{GLEANER_MINIO_BUCKET} all ")
+ return df_csv
+
+
diff --git a/dagster/implnets/workflows/tasks/tasks/assets/tenants.py b/dagster/implnets/workflows/tasks/tasks/assets/tenants.py
new file mode 100644
index 00000000..8eab8fcc
--- /dev/null
+++ b/dagster/implnets/workflows/tasks/tasks/assets/tenants.py
@@ -0,0 +1,231 @@
+import json
+from typing import Any
+from io import StringIO
+import yaml
+import os
+import pandas as pd
+from pydash import pick
+from dagster import (asset,
+ get_dagster_logger,
+ Output,
+ DynamicPartitionsDefinition,
+ define_asset_job,
+ AssetSelection,
+ sensor, SensorResult, DefaultSensorStatus,
+ RunRequest,
+ asset_sensor, AssetKey,
+ )
+from ec.datastore import s3
+from distutils import util
+from ..resources.gleanerS3 import _pythonMinioAddress
+from ec.reporting.report import generateReportStats
+
+GLEANER_MINIO_ADDRESS = os.environ.get('GLEANERIO_MINIO_ADDRESS')
+GLEANER_MINIO_PORT = os.environ.get('GLEANERIO_MINIO_PORT')
+GLEANER_MINIO_USE_SSL = bool(util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL', 'true')))
+GLEANER_MINIO_SECRET_KEY = os.environ.get('GLEANERIO_MINIO_SECRET_KEY')
+GLEANER_MINIO_ACCESS_KEY = os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')
+GLEANER_MINIO_BUCKET = os.environ.get('GLEANERIO_MINIO_BUCKET')
+GLEANERIO_GRAPH_URL = os.environ.get('GLEANERIO_GRAPH_URL')
+GLEANERIO_GRAPH_SUMMARY_NAMESPACE = os.environ.get('GLEANERIO_GRAPH_SUMMARY_NAMESPACE')
+GLEANERIO_CSV_CONFIG_URL = os.environ.get('GLEANERIO_CSV_CONFIG_URL')
+
+MINIO_OPTIONS={"secure":GLEANER_MINIO_USE_SSL
+
+ ,"access_key": GLEANER_MINIO_ACCESS_KEY
+ ,"secret_key": GLEANER_MINIO_SECRET_KEY
+ }
+
+def _graphSummaryEndpoint(community):
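+ # the "all" community reads the global summary namespace; each tenant gets
+ # its own "<community>_summary" namespace in the same triplestore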
+ if community == "all":
+ url = f"{GLEANERIO_GRAPH_URL}/namespace/{GLEANERIO_GRAPH_SUMMARY_NAMESPACE}/sparql"
+ else:
+ url = f"{GLEANERIO_GRAPH_URL}/namespace/{community}_summary/sparql"
+ return url
+@asset(group_name="community",key_prefix="task",
+ required_resource_keys={"triplestore"})
+def task_tenant_sources(context) -> Any:
+ s3_resource = context.resources.triplestore.s3
+ t = s3_resource.getTenantInfo()
+ tenants = t['tenant']
+ listTenants = map(lambda a: a['community'], tenants)
+ get_dagster_logger().info(str(t))
+
+ return t
+ # metadata={
+ # "tennants": str(listTenants), # Metadata can be any key-value pair
+ # "run": "gleaner",
+ # # The `MetadataValue` class has useful static methods to build Metadata
+ # }
+ # )
+@asset(group_name="community",key_prefix="task",
+ #name='task_tenant_names',
+ required_resource_keys={"triplestore"})
+def task_tenant_names(context, task_tenant_sources) -> Output[Any]:
+
+ tenants = task_tenant_sources['tenant']
+ # materialize the map() before logging/metadata; a map object is single-use
+ communities = list(map(lambda a: a['community'], tenants))
+ get_dagster_logger().info(str(communities))
+ return Output(
+ communities,
+ metadata={
+ "tenants": str(communities), # Metadata can be any key-value pair
+ "run": "gleaner",
+ # The `MetadataValue` class has useful static methods to build Metadata
+ }
+ )
+
+
+community_partitions_def = DynamicPartitionsDefinition(name="tenantsPartition")
+tenant_task_job = define_asset_job(
+ "tenant_job", AssetSelection.keys(AssetKey(["task","loadstatsCommunity"])), partitions_def=community_partitions_def
+)
+#@sensor(job=tenant_job)
+@asset_sensor(asset_key=AssetKey(["task","task_tenant_names"]),
+ default_status=DefaultSensorStatus.RUNNING,
+ job=tenant_task_job)
+def community_sensor(context, asset_event):
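+ # when task_tenant_names materializes, register any new community as a
+ # dynamic partition and request a loadstatsCommunity run for it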
+ tenants = context.repository_def.load_asset_value(AssetKey(["task","task_tenant_names"]))
+ new_community = [
+ community
+ for community in tenants
+ if not context.instance.has_dynamic_partition(
+ community_partitions_def.name, community
+ )
+ ]
+
+ return SensorResult(
+ run_requests=[
+ RunRequest(partition_key=community) for community in new_community
+ ],
+ dynamic_partitions_requests=[
+ community_partitions_def.build_add_request(new_community)
+ ],
+ )
+REPORT_PATH = "reports/"
+COMMUNITY_PATH = "reports/community/"
+TASKS_PATH="tasks/"
+ORG_PATH = "orgs/"
+STAT_FILE_NAME = "load_report_graph.json"
+
+def _pythonMinioUrl(url):
+ if url.endswith(".amazonaws.com"):
+ PYTHON_MINIO_URL = "s3.amazonaws.com"
+ else:
+ PYTHON_MINIO_URL = url
+ return PYTHON_MINIO_URL
+
+def getName(name):
+ return name.replace("orgs/","").replace(".nq","")
+# @asset(group_name="community")
+# def source_list(task_tenant_sources) -> Output(str):
+# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), MINIO_OPTIONS)
+# orglist = s3Minio.listPath(GLEANER_MINIO_BUCKET, ORG_PATH,recursive=False)
+# sources = map( lambda f: { "name": getName(f.object_name)}, orglist )
+# source_json = json.dumps(list(sources))
+# os.makedirs("data", exist_ok=True)
+#
+# s3Minio.putReportFile(GLEANER_MINIO_BUCKET, "all", f"source_list.json", source_json )
+# with open("data/source_list.json", "w") as f:
+# json.dump(list(sources), f)
+# return source_json
+#@asset(deps=[source_list])
+
+# set a prefix so we can have some named stats file
+
+#@asset( group_name="load")
+@asset(partitions_def=community_partitions_def,
+ deps=[AssetKey(["task","task_tenant_sources"])],
+ group_name="community",
+ key_prefix="task",
+ required_resource_keys={"triplestore"} )
+def loadstatsCommunity(context, task_tenant_sources) -> str:
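+ # one partition per tenant/community; the partition key picks which
+ # tenant's sources are aggregated into this run's stats CSV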
+ prefix="history"
+ logger = get_dagster_logger()
+ s3_config = context.resources.triplestore.s3
+ s3Client = context.resources.triplestore.s3.s3.get_client()
+ s3Minio = s3.MinioDatastore(_pythonMinioUrl(s3_config.GLEANERIO_MINIO_ADDRESS), MINIO_OPTIONS)
+ # sourcelist = list(s3Minio.listPath(GLEANER_MINIO_BUCKET, ORG_PATH,recursive=False))
+ community_code = context.asset_partition_key_for_output()
+ stats = []
+ try:
+ ts = task_tenant_sources
+ t = list(filter(lambda a: a['community'] == community_code, ts["tenant"]))
+ s = t[0]["sources"]
+
+ for source in s:
+ dirs = s3Minio.listPath(GLEANER_MINIO_BUCKET, path=f"{REPORT_PATH}{source}/", recursive=False)
+ for d in dirs:
+ latestpath = f"{REPORT_PATH}{source}/latest/"
+ if (d.object_name.casefold() == latestpath.casefold()) or (d.is_dir == False):
+ continue
+ path = f"{d.object_name}{STAT_FILE_NAME}"
+ s3ObjectInfo = {"bucket_name": GLEANER_MINIO_BUCKET, "object_name": path}
+ try:
+ # resp = s3Client.getFile(path=path)
+ resp = s3Minio.getFileFromStore(s3ObjectInfo)
+ stat = json.loads(resp)
+ stat = pick(stat, 'source', 'sitemap', 'date', 'sitemap_count', 'summoned_count',
+ 'missing_sitemap_summon_count',
+ 'graph_urn_count', 'missing_summon_graph_count')
+ stats.append(stat)
+ except Exception as ex:
+ context.log.info(f"Failed to get source {source} for tennant {community_code} {ex}")
+ except Exception as ex:
+ context.log.info(f"Failed to get tenant {community_code} {ex}")
+ # for source in task_tenant_sources["tennant"]:
+ # try:
+ # # stat = s3Minio.getReportFile(GLEANER_MINIO_BUCKET,source.get("name"), STAT_FILE_NAME )
+ # repo = community_code
+ # dirs = s3Minio.listPath( path=f"{REPORT_PATH}{repo}/",recursive=False )
+ # for d in dirs:
+ # latestpath = f"{REPORT_PATH}{repo}/latest/"
+ # if (d.object_name.casefold() == latestpath.casefold()) or (d.is_dir == False):
+ # continue
+ # path = f"/{d.object_name}{STAT_FILE_NAME}"
+ #
+ # try:
+ # resp = s3Minio.getFile(path=path)
+ # stat = json.loads(resp)
+ # stat = pick(stat, 'source', 'sitemap', 'date', 'sitemap_count', 'summoned_count',
+ # 'missing_sitemap_summon_count',
+ # 'graph_urn_count', 'missing_summon_graph_count')
+ # stats.append(stat)
+ # except Exception as ex:
+ # logger.info(f"no missing graph report {source.get('name')} {ex}")
+ # except Exception as ex:
+ # logger.info(f"Failed to get { source.get('name')} {ex}")
+ context.log.info(stats)
+ df = pd.DataFrame(stats)
+ context.log.info(df)
+ # try:
+ # os.mkdir(f"data/{community_code}")
+ # except FileExistsError:
+ # logger.debug(f"directory data/{community_code} exists")
+ # except FileNotFoundError:
+ # logger.error(f"error creating directory. Fix community name. 'data/{community_code}' ")
+ #df.to_csv(f"data/{community_code}/all_stats.csv")
+
+ df_csv = df.to_csv()
+
+ # stringio = StringIO(df_csv)
+ # s3Client.upload_fileobj(stringio, s3_config.GLEANERIO_MINIO_BUCKET, f"data/{community_code}/all_stats.csv")
+ # humm, should we just have an EC utils resource
+ s3Minio.putReportFile(s3_config.GLEANERIO_MINIO_BUCKET, f"tenant/{community_code}", "all_stats.csv", df_csv)
+ # with open(stringio, "rb") as f:
+ # s3.upload_fileobj(f, s3.GLEANERIO_MINIO_BUCKET, f"data/all/all_stats.csv")
+ context.log.info(f"all_stats.csv uploaded using ec.datastore.putReportFile {s3_config.GLEANERIO_MINIO_BUCKET}tenant/{community_code} ")
+ #return df_csv # now checking return types
+
+ context.log.info(f"GLEANERIO_CSV_CONFIG_URL {GLEANERIO_CSV_CONFIG_URL} ")
+
+ report = generateReportStats(GLEANERIO_CSV_CONFIG_URL, s3_config.GLEANERIO_MINIO_BUCKET, s3Minio,
+ _graphSummaryEndpoint(community_code), community_code)
+ bucket, obj = s3Minio.putReportFile(s3_config.GLEANERIO_MINIO_BUCKET, f"tenant/{community_code}",
+ "report_stats.json", report)
+ context.log.info(
+ f"report_stats.json uploaded using ec.datastore.putReportFile {s3_config.GLEANERIO_MINIO_BUCKET}/tenant/{community_code} ")
+
+ return df_csv
diff --git a/dagster/implnets/workflows/tasks/tasks/jobs/jobs.py b/dagster/implnets/workflows/tasks/tasks/jobs/jobs.py
new file mode 100644
index 00000000..1ba501bd
--- /dev/null
+++ b/dagster/implnets/workflows/tasks/tasks/jobs/jobs.py
@@ -0,0 +1,22 @@
+from dagster import (
+ op, job, Config,
+ sensor, RunRequest, RunConfig,
+ SensorEvaluationContext, asset_sensor, EventLogEntry,
+ SkipReason,
+ AssetKey,
+ static_partitioned_config, dynamic_partitioned_config, DynamicPartitionsDefinition,
+ define_asset_job, AssetSelection, graph_asset,
+ BackfillPolicy
+)
+from ..assets import task_tenant_sources
+
+from dagster_aws.s3.sensor import get_s3_keys
+from typing import List, Dict
+from pydantic import Field
+
+
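+# job run by the tenant s3 sensor: re-materializes task_tenant_sources
+# whenever the tenant config file in s3 changes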
+tenant_asset_job = define_asset_job(
+ name="task_tenant_config_updated_job",
+ selection=AssetSelection.assets(task_tenant_sources),
+)
diff --git a/dagster/implnets/workflows/tasks/tasks/resources/gleanerS3.py b/dagster/implnets/workflows/tasks/tasks/resources/gleanerS3.py
new file mode 100644
index 00000000..01fdfb45
--- /dev/null
+++ b/dagster/implnets/workflows/tasks/tasks/resources/gleanerS3.py
@@ -0,0 +1,62 @@
+import yaml
+from dagster import asset, get_dagster_logger, define_asset_job, ConfigurableResource
+from dagster_aws.s3 import S3Resource
+
+#from dagster import Field
+from pydantic import Field
+
+def _pythonMinioAddress(url, port=None):
+ if (url.endswith(".amazonaws.com")):
+ PYTHON_MINIO_URL = "s3.amazonaws.com"
+ else:
+ PYTHON_MINIO_URL = url
+ if port is not None:
+ PYTHON_MINIO_URL = f"{PYTHON_MINIO_URL}:{port}"
+ return PYTHON_MINIO_URL
+
+
+class gleanerS3Resource(ConfigurableResource):
+ s3: S3Resource
+ GLEANERIO_MINIO_BUCKET: str = Field(
+ description="GLEANERIO_MINIO_BUCKET.")
+ GLEANERIO_MINIO_ADDRESS: str = Field(
+ description="GLEANERIO_MINIO_ADDRESS.")
+ GLEANERIO_MINIO_PORT: str = Field(
+ description="GLEANERIO_MINIO_PORT.")
+ GLEANERIO_MINIO_USE_SSL: bool = Field(
+ default=False)
+ GLEANERIO_CONFIG_PATH : str = Field(
+ description="GLEANERIO_CONFIG_PATH.", default="scheduler/configs/test/")
+ GLEANERIO_TENANT_FILENAME : str = Field(
+ description="GLEANERIO_TENANT_CONFIG.", default="tenant.yaml")
+ # now using the boto s3 embedded in dagster_aws, but just in case we need them
+ GLEANERIO_MINIO_ACCESS_KEY: str = Field(
+ description="GLEANERIO_MINIO_ACCESS_KEY")
+ GLEANERIO_MINIO_SECRET_KEY: str = Field(
+ description="GLEANERIO_MINIO_SECRET_KEY")
+## https://docs.dagster.io/_apidocs/libraries/dagster-aws#s3
+# fields from dagster_aws.s3.S3Resource
+# region_name
+# endpoint_url
+# use_ssl
+# aws_access_key_id
+# aws_secret_access_key
+ def listPath(self, path='orgs'):
+ return self.s3.get_client().list_objects(
+ Bucket=self.GLEANERIO_MINIO_BUCKET,
+ Prefix=path,
+ )["Contents"]
+
+ def getTenantInfo(self):
+ path = f"{self.GLEANERIO_CONFIG_PATH}{self.GLEANERIO_TENANT_FILENAME}"
+ try:
+ r = self.s3.get_client().get_object(
+ Bucket=self.GLEANERIO_MINIO_BUCKET,
+ Key=path,
+ )
+ return yaml.safe_load(r["Body"])
+ except Exception as ex:
+ get_dagster_logger().info(f"tenant file {path} not found in bucket {self.GLEANERIO_MINIO_BUCKET} at {self.GLEANERIO_MINIO_ADDRESS} ")
+ raise ex
+ #endpoint_url =_pythonMinioAddress(GLEANER_MINIO_ADDRESS, port=GLEANER_MINIO_PORT)
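+
+# A hedged usage sketch -- actual wiring happens in the workflow Definitions,
+# and the values here are illustrative, not the deployed configuration:
+#
+# from dagster_aws.s3 import S3Resource
+# s3 = gleanerS3Resource(
+#     s3=S3Resource(endpoint_url="https://minio.example.org"),
+#     GLEANERIO_MINIO_BUCKET="gleaner",
+#     GLEANERIO_MINIO_ADDRESS="minio.example.org",
+#     GLEANERIO_MINIO_PORT="443",
+#     GLEANERIO_MINIO_ACCESS_KEY="ACCESS",
+#     GLEANERIO_MINIO_SECRET_KEY="SECRET",
+# )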
diff --git a/dagster/implnets/workflows/tasks/tasks/resources/graph.py b/dagster/implnets/workflows/tasks/tasks/resources/graph.py
new file mode 100644
index 00000000..44f09303
--- /dev/null
+++ b/dagster/implnets/workflows/tasks/tasks/resources/graph.py
@@ -0,0 +1,138 @@
+import os
+from typing import Any, Dict
+
+import pydash
+from dagster import ConfigurableResource, Config, EnvVar, get_dagster_logger
+
+#from dagster import Field
+from pydantic import Field
+import requests
+from .gleanerS3 import gleanerS3Resource
+# Let's try to use the dagster-aws S3Resource as the minio configuration
+
+# class AirtableConfig(Config):
+# DAGSTER_GLEANER_CONFIG_PATH = os.environ.get('DAGSTER_GLEANER_CONFIG_PATH', "/scheduler/gleanerconfig.yaml")
+#
+# # Vars and Envs
+# GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
+# # env items
+# URL = os.environ.get('PORTAINER_URL')
+# APIKEY = os.environ.get('PORTAINER_KEY')
+# CONTAINER_WAIT_TIMEOUT= os.environ.get('GLEANERIO_CONTAINER_WAIT_SECONDS', 5)
+#
+# Let's try to use dasgeter aws as the minio configuration
+# GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
+# GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
+# GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
+# GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
+# GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
+# GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
+#
+# # set for the earhtcube utiltiies
+# MINIO_OPTIONS={"secure":GLEANER_MINIO_USE_SSL
+#
+# ,"access_key": GLEANER_MINIO_ACCESS_KEY
+# ,"secret_key": GLEANER_MINIO_SECRET_KEY
+# }
+#
+# GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
+# # using GLEANER, even though this is a nabu property... same prefix seems easier
+# GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
+# GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
+# GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
+# GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
+# GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
+# GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
+# GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
+# GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
+# GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
+# GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
+# GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
+# GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
+# GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
+# #GLEANERIO_SUMMARY_GRAPH_ENDPOINT = os.environ.get('GLEANERIO_SUMMARY_GRAPH_ENDPOINT')
+# GLEANERIO_SUMMARY_GRAPH_NAMESPACE = os.environ.get('GLEANERIO_SUMMARY_GRAPH_NAMESPACE',f"{GLEANER_GRAPH_NAMESPACE}_summary" )
+#
+# SUMMARY_PATH = 'graphs/summary'
+# RELEASE_PATH = 'graphs/latest'
+
+
+class GraphResource(ConfigurableResource):
+ GLEANERIO_GRAPH_URL: str = Field(
+ description="GLEANERIO_GRAPH_URL.")
+ GLEANERIO_GRAPH_NAMESPACE: str = Field(
+ description="GLEANERIO_GRAPH_NAMESPACE.")
+ GLEANERIO_GRAPH_SUMMARY_NAMESPACE: str = Field(
+ description="GLEANERIO_GRAPH_SUMMARY_NAMESPACE.")
+ GLEANERIO_GRAPH_SUMMARIZE: str = Field(
+ description="GLEANERIO_GRAPH_SUMMARIZE.")
+ s3: gleanerS3Resource
+
+# need multiple namespaces. let's do this.
+ def GraphEndpoint(self, namespace):
+ url = f"{self.GLEANERIO_GRAPH_URL}/namespace/{namespace}/sparql"
+ return url
+
+ @staticmethod
+ def PythonMinioAddress(url, port=None):
+ if url.endswith(".amazonaws.com"):
+ PYTHON_MINIO_URL = "s3.amazonaws.com"
+ else:
+ PYTHON_MINIO_URL = url
+ if port is not None:
+ PYTHON_MINIO_URL = f"{PYTHON_MINIO_URL}:{port}"
+ return PYTHON_MINIO_URL
+ def post_to_graph(self, source, path='graphs/latest', extension="nq", graphendpoint=None):
+ if graphendpoint is None:
+ graphendpoint = self.GraphEndpoint(self.GLEANERIO_GRAPH_NAMESPACE)
+ # revision of EC utilities, will have a insertFromURL
+ #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
+ proto = "http"
+# this need to get file from s3.
+
+ # the minio settings live on the nested gleanerS3Resource
+ if self.s3.GLEANERIO_MINIO_USE_SSL:
+ proto = "https"
+ port = self.s3.GLEANERIO_MINIO_PORT
+ address = self.PythonMinioAddress(self.s3.GLEANERIO_MINIO_ADDRESS, self.s3.GLEANERIO_MINIO_PORT)
+ bucket = self.s3.GLEANERIO_MINIO_BUCKET
+ release_url = f"{proto}://{address}/{bucket}/{path}/{source}_release.{extension}"
+ # BLAZEGRAPH SPECIFIC
+ # url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
+ # get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
+ # r = requests.post(url)
+ # log.debug(f' status:{r.status_code}') # status:404
+ # get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
+ # if r.status_code == 200:
+ # # ''
+ # if 'data modified="0"' in r.text:
+ # get_dagster_logger().info(f'graph: no data inserted ')
+ # raise Exception("No Data Added: " + r.text)
+ # return True
+ # else:
+ # get_dagster_logger().info(f'graph: error')
+ # raise Exception(f' graph: insert failed: status:{r.status_code}')
+
+ ### GENERIC LOAD FROM
+ url = f"{graphendpoint}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
+ get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
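+ # SPARQL 1.1 UPDATE "LOAD <url>" makes the triplestore fetch the release
+ # file directly from s3, so the data never passes through dagster itself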
+ loadfrom = {'update': f'LOAD <{release_url}>'}
+ headers = {
+ 'Content-Type': 'application/x-www-form-urlencoded'
+ }
+ r = requests.post(url, headers=headers, data=loadfrom )
+ get_dagster_logger().debug(f' status:{r.status_code}') # status:404
+ get_dagster_logger().info(f'graph: LOAD from {release_url}: status:{r.status_code}')
+ if r.status_code == 200:
+ get_dagster_logger().info(f'graph load response: {str(r.text)} ')
+ # ''
+ if 'mutationCount=0' in r.text:
+ get_dagster_logger().info(f'graph: no data inserted ')
+ #raise Exception("No Data Added: " + r.text)
+ return True
+ else:
+ get_dagster_logger().info(f'graph: error {str(r.text)}')
+ raise Exception(f' graph: failed, LOAD from {release_url}: status:{r.status_code}')
+
+class BlazegraphResource(GraphResource):
+ pass
+
diff --git a/dagster/implnets/workflows/tasks/tasks/sch/s3_sensor.py b/dagster/implnets/workflows/tasks/tasks/sch/s3_sensor.py
new file mode 100644
index 00000000..17ad93c7
--- /dev/null
+++ b/dagster/implnets/workflows/tasks/tasks/sch/s3_sensor.py
@@ -0,0 +1,46 @@
+from dagster import(
+ op, job, Config, get_dagster_logger,
+ sensor, RunRequest, RunConfig,
+ SensorEvaluationContext, asset_sensor, EventLogEntry,
+ SkipReason,
+ AssetKey,
+ static_partitioned_config,
+ DefaultSensorStatus
+)
+from dagster_aws.s3.sensor import get_s3_keys
+
+from ..jobs.jobs import tenant_asset_job
+
+@sensor(name="s3_config_source_sensor",
+ default_status=DefaultSensorStatus.RUNNING,
+ #, job_name="sources_updated_job",
+ job=tenant_asset_job,
+ required_resource_keys={"s3"},
+ # minimum_interval_seconds=3600
+ )
+def tenant_s3_sensor(context):
+
+ gleaner_s3 = context.resources.s3
+
+ since_key = context.cursor or None
+ get_dagster_logger().info(f"sinceKey: {since_key}")
+ config_path = gleaner_s3.GLEANERIO_CONFIG_PATH
+ filename = f"{gleaner_s3.GLEANERIO_CONFIG_PATH}{gleaner_s3.GLEANERIO_TENANT_FILENAME}"
+ new_s3_keys = gleaner_s3.s3.get_client().head_object(
+ Bucket=gleaner_s3.GLEANERIO_MINIO_BUCKET,
+ Key=filename,
+ )
+ if not new_s3_keys:
+ return SkipReason(f"No new s3 files found for bucket {gleaner_s3.GLEANERIO_MINIO_BUCKET}. {filename}")
+ get_dagster_logger().info(f"metadata {new_s3_keys}")
+ #new_s3_keys = list(new_s3_keys)
+ last_key = str(new_s3_keys['LastModified'])
+ get_dagster_logger().info(f"last_modified: {last_key}")
+ run_requests =[]
+ if since_key is None or since_key < last_key:
+ #run_requests = [RunRequest(run_key=s3_key, run_config={}) for s3_key in new_s3_keys]
+ run_requests = [RunRequest(run_key=last_key, run_config={})]
+ context.update_cursor(last_key)
+ return run_requests
diff --git a/dagster/implnets/workflows/tasks/tasks/sch/weekly_sch.py b/dagster/implnets/workflows/tasks/tasks/sch/weekly_sch.py
new file mode 100644
index 00000000..304df6ed
--- /dev/null
+++ b/dagster/implnets/workflows/tasks/tasks/sch/weekly_sch.py
@@ -0,0 +1,23 @@
+from dagster import schedule, RunRequest, ScheduleEvaluationContext, define_asset_job, AssetSelection
+
+load_analytics_job = define_asset_job("load_analytics_job", selection=AssetSelection.groups("load"))
+graph_analytics_job = define_asset_job("graph_analytics_job", selection=AssetSelection.groups("graph"))
+
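+# "@weekly" is cron shorthand for "0 0 * * 0" (midnight Sunday)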
+@schedule(job=load_analytics_job, cron_schedule="@weekly")
+def loadstats_schedule(context: ScheduleEvaluationContext):
+
+ return RunRequest(
+ run_key=None,
+ run_config={}
+ )
+
+@schedule(job=graph_analytics_job, cron_schedule="@weekly")
+def all_graph_stats_schedule(context: ScheduleEvaluationContext):
+
+ return RunRequest(
+ run_key=None,
+ run_config={}
+ )
+
diff --git a/docs/README.md b/docs/README.md
index d759a0e2..cff15560 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -1,4 +1,4 @@
-# Dagster
+# Scheduler, AKA Dagster
## About
@@ -15,142 +15,370 @@ basic view and doesn't present any scaling or fail over elements.
The key elements are:
-* sources to configuration and then the creation of the archive files that are loaded and used
-to load into the Gleaner and Nabu tools
+* source configurations that are loaded into the Gleaner and Nabu tools, and pushed to the triplestore. These are now stored in
+an s3 location
+ * gleaner configuration: a list of sources to load. (NOTE: this is also a docker config that needs to be kept in sync for things to work)
+ * tenant configuration: a list of communities, and which sources they load
* The Dagster set which loads three containers to support workflow operations
* The Gleaner Architecture images which loads three or more containers to support
* s3 object storage
* graph database (triplestore)
* headless chrome for page rendering to support dynamically inserted JSON-LD
* any other support packages like text, semantic or spatial indexes
-* The GleanerIO tools which loads two containers as services (Gleaner and Nabu) that are run
-and removed by the Dagster workflow
-![upper level](images/gleanerDagster.svg)
+### WORKFLOWS
+
+There are three workflows:
+* ingest - loads sources
+* tasks - weekly tasks
+* ecrr - loads the Earthcube Resource Registry
+
+
+
+```mermaid
+---
+title: Dagster Stack
+---
+flowchart LR
+ subgraph DockerCompose[Docker Compose Stacks]
+ maincompose[dagster/implnets/deployment/compose_project.yaml]
+ project_overrides[dagster/implnets/deployment/compose_project_eco_override.yaml]
+ end
+
+ subgraph Config
+ subgraph s3
+ gleanconfig[gleanerconfig.yaml]
+ tenant[tenant.yaml]
+
+ end
+
+ subgraph Dagster/Config
+ workflow[ eco-wf ]
+ container-config-gleaner[gleanerio container config]
+ container-config-nabu[gleanerio container config for nabu]
+ end
+ env['environment variables']
+
+ end
+ subgraph docker[docker managed by portainer]
+
+ subgraph Containers
+ dagit
+ dagster
+ postgres
+ ingest
+ tasks
+ ecrr
+ end
+ config
+ subgraph Volumes
+ dagster-postgres
+ end
+ end
+ postgres--uses-->dagster-postgres
+ dagster--uses-->workflow
+ dagit--uses-->workflow
+ workflow-->config
+ maincompose--deploys-->dagit[dagster webserver]
+ maincompose--deploys-->dagster[dagster main]
+ maincompose--deploys-->ingest[gleanerio ingest code]
+ maincompose--deploys-->tasks[gleanerio task code]
+ project_overrides--deploys-->ecrr[earthcube code]
+ ingest--reads-->gleanconfig
+ ingest--reads-->tenant
+ tasks--reads-->gleanconfig
+ tasks--reads-->tenant
+ dagster--uses-->postgres
+
+```
+#### basic deployment
+
+1. information for environment variables is gathered
+2. the configuration files are created and loaded to s3 and docker configs
+3. a docker stack is created, and the environment variables are added
+4. portainer deploys the containers
+5. when ingest and tasks are executed, they read the configuration files from s3
+
+#### Ingest Workflow
+```mermaid
+---
+title: Ingest Workflow Sequence
+---
+sequenceDiagram
+ participant S3
+ participant Ingest
+ participant Portainer
+ participant Graph
+ S3->>Ingest: read sources from scheduler/configs/gleanerconfig.yaml
+ S3->>Ingest: read tenant from scheduler/configs/tenant.yaml
+ Ingest-->Ingest: create gleanerio container
+ Ingest->>Portainer: run gleanerio
+ Portainer-->Portainer: docker configs mounted in gleanerio container
+ Portainer-->Portainer: summon for sources
+ Portainer->>S3: jsonld to s3
+ Portainer->>Ingest: logs returned
+ Ingest->>S3: logs from run to S3
+ Ingest->>Ingest: create load reports using EC Utils
+ Ingest->>S3: load reports to s3
+ Ingest->>Portainer: run nabu to
+ Portainer-->Portainer: convert jsonld to release and release summary
+ Portainer->>S3: release and release summary to s3
+ Ingest->>Ingest: create graph report using EC Utils
+ Ingest->>S3: graph report to s3
+ Ingest->>Graph: Create a namespaces for tenant
+ Ingest->>Graph: load release and release summary to namespaces
+```
-### Template files
-
-The template files define the Dagster Ops, Jobs and Schedules. From these
-and a GleanerIO config file a set of Python scripts for Dagster are created in
-the output directory.
-
-These only need to be changed or used to regenerate if you wish to alter the
-execution graph (ie, the ops, jobs and schedules) or change the config file.
-In the later case only a regeneration needs to be done.
+```mermaid
+---
+title: Ingest Simplified Flowchart
+---
+flowchart LR
+ subgraph config
+ s3_config_sensors
+ end
+ subgraph jobs
+ summon_and_release
+ tenant_release
+ end
+ subgraph assets
+ sources
+ tenants
+ end
+
+
+
+ s3_config_sensors--monitors --> configs
+ s3_config_sensors--writes -->sources
+ s3_config_sensors--writes -->tenants
+ summon_and_release--uses-->sources --runs --> gleanerio
+ tenant_release--uses-->tenants --runs --> tenant_release
+ gleanerio--stores JSONLD -->summon
+ gleanerio--stores log -->logs
+ summon_and_release-- reads --> summon
+ summon_and_release-- converts to graph -->graph_path
+ tenant_release -- monitors --> graph_path
+ tenant_release -- loads releases to --> tenant_namespace
+ tenant_release -- loads releases to --> tenant_summary_namespace
+
+
+ subgraph portainer
+ gleanerio
+ tenant_ui
+ end
+ subgraph services
+ triplestore
+ tenant_namespace
+ tenant_summary_namespace
+ end
+
+ subgraph minio_s3
+ subgraph bucket_paths
+ subgraph scheduler
+ configs["`scheduler/configs`"]
+ logs
+ end
+ summon
+ graph_path['graph']
+ end
+ end
+
+
+
+
+
+```
-There are then Docker build scripts to build out new containers.
+#### Task workflows
+```mermaid
+---
+title: Task Workflow Sequence
+---
+sequenceDiagram
+ participant S3
+ participant Ingest
+ participant Portainer
+ participant Graph
+ Ingest->>Ingest: all_graph_stats assets: graph statistics using EC Utils
+ Ingest->>S3: load all_graph_stats to s3
+ Ingest->>Ingest: source_stats assets: loadstatsHistory using EC Utils
+ Ingest->>Graph: sparql query to get graph stats
+ Graph->>Ingest: results for source_stats
+ Ingest->>S3: source_stats to s3
+
+```
-See: [template](./implnets/src/implnet-example/templates)
## Steps to build and deploy
-The deployment can be tested locally using docker.
-The production 'containers' are built with a github action, or using a makefile.
+The deployment can be tested locally. You can set up a services stack in docker to test locally, or use existing
+services.
+
+The production containers (dagster, gleaner, and nabu) are built with a github action. You can also use a makefile.
This describes the local and container deployment
We use portainer to manage our docker deployments.
-
-1) move to the the deployment directory
-2) copy the envFile.env to .env
+## Server Deployment
+[Production example for Earthcube](eco_deploy.md)
+
+## DEVELOPER: PyCharm -- run local with remote services
+You can test components in PyCharm. Run configurations for PyCharm are in runConfigurations (TODO: instructions);
+use the [EnvFile plugin](https://plugins.jetbrains.com/plugin/7861-envfile).
+![pycharm runconfig](images/pycharm_runconfig.png)
+1) move to the implnets/deployment directory
+2) copy the envFile.env to .env [see](#environment-files); use the [EnvFile plugin](https://plugins.jetbrains.com/plugin/7861-envfile)
+3) edit the entries to point at a portainer/traefik with running services
+4) edit configuration files in implnets/configs/PROJECT: gleanerconfig.yaml, tenant.yaml
+5) upload configuration implnets/configs/PROJECT to s3 scheduler/configs: gleanerconfig.yaml, tenant.yaml
+6) run a PyCharm run config, e.g. dagster_ingest_debug
+7) go to http://localhost:3000/
+8) you can [test the schedules](#test-schedules)
+
+## Full stack test -- run local with remote services
+1) move to the implnets/deployment directory
+2) copy the envFile.env to .env [see](#environment-files); use the [EnvFile plugin](https://plugins.jetbrains.com/plugin/7861-envfile)
3) edit the entries.
+4) edit configuration files in implnets/configs/PROJECT: gleanerconfig.yaml, tenant.yaml
+5) upload configuration implnets/configs/PROJECT to s3 scheduler/configs: gleanerconfig.yaml, tenant.yaml
4) for local, `./dagster_localrun.sh`
5) go to http://localhost:3000/
To deploy in portainer, use the deployment/compose_project.yaml docker stack.
### docker compose Configuration:
-1) there are three files that need to be installed into docker configs.
-
-| file | local | stack | note |
-|--------------------|-------------------------------------| ------ |--------------------------|
-| workspace | configs/PROJECT/worksapce.yaml | env () | used by dagster |
-| gleanerconfig.yaml | configs/PROJECT/gleanerconfigs.yaml | env () | needs to be in portainer |
-| nabuconfig.yaml | configs/PROJECT/nabuconfigs.yaml | env () | needs to be in portainer |
-2)
-
-## Editing Template
-
-you can edit implnets/template
-
-then deploy with
-
-`pygen.py -cf ./configs/eco/gleanerconfig.yaml -od ./generatedCode/implnet-eco/output -td ./templates/v1 -d 7 ``
-
-If you are running using dagster_localrun.sh
-1) go to the deployment at http://localhost:3000/locations
-2) click 'reload on gleaner@project_grpc'
-3) then if code is correct, then you will be able run the changed [workflows](http://localhost:3000/overview/jobs)
-
-(TODO NEEDS MORE
-)
-
-## MAKEFILE
-1) Place your gleanerconfig.yaml (use that exact name) in _confgis/NETWORK/gleanerconfig.yaml_
- 1) Note: When doing your docker build, you will use this NETWORK name as a value in the command such as
- ```bash
- podman build --tag="docker.io/fils/dagster_nsdf:$(VERSION)" --build-arg implnet=nsdf --file=./build/Dockerfile
- ```
-1) Make any needed edits to the templates in directory _templates/v1/_ or make your own template set in that directory
-
-The command to build using the pygen.py program follows. This is done from the standpoint of running in from the
-implenet directory.
-
-```bash
- python pygen.py -cf ./configs/nsdf/gleanerconfig.yaml -od ./generatedCode/implnet-nsdf/output -td ./templates/v1 -d 7
-```
-
-1) This will generate the code to build a dagster instance from the combination of the templates and gelanerconfig.yaml.
-2)
-
-
-
-
+There are configuration files that are needed.
+They are installed in two places:
+* as docker configs
+* as scheduler configs in S3
+
+ (NOTE: I think the configs are still needed in the containers)
+
+| file | local | | note |
+|--------------------|------------------------------------------|---------------------------------------------------|-----------------------------------------|
+| workspace | configs/PROJECT/workspace.yaml | dockerconfig: workspace | docker compose: used by dagster |
+| gleanerconfig.yaml | configs/PROJECT/gleanerconfig.yaml | s3:{bucket}/scheduler/configs/gleanerconfig.yaml | ingest workflow: needs to be in minio/s3 |
+| tenant.yaml | configs/PROJECT/tenant.yaml | s3:{bucket}/scheduler/configs/tenant.yaml | ingest workflow: needs to be in minio/s3 |
+| dagster.yaml | dagster/implnets/deployment/dagster.yaml | dockerconfig: dagster | docker compose: used by dagster |
+| gleanerconfig.yaml | configs/PROJECT/gleanerconfig.yaml | dockerconfig: gleaner | mounted in gleaner docker container |
+| nabuconfig.yaml | configs/PROJECT/nabuconfig.yaml | dockerconfig: nabu | mounted in nabu docker container |
+
+(NOTE: there is also a gleaner docker config (below in runtime configuration) that needs to be kept in sync to make things work)
+
+[Docker Configs for gleanerio containers ](https://github.com/earthcube/scheduler/issues/106) are still needed:
+
+| file | local | stack | note |
+|---------------------|-----------------------------------------------------------| ------ |---------------------------------------|
+| gleanerconfig.yaml | configs/PROJECT/gleanerconfig.yaml | env () | generated code; needs to be in ~~portainer~~ |
+| nabuconfig.yaml | configs/PROJECT/nabuconfig.yaml | env () | generated code; needs to be in ~~portainer~~ |
+
+3) when the containers are running in a stack on portainer, they will need to
+ be updated by pulling from dockerhub. The ENV variables may need to be updated for the CONTAINER*_TAG
+
+
+## Runtime configuration
+
+### upload to an s3 bucket
+
+| file | local | | note |
+|--------------------|---------------------------------------------------| ------ |---------------------------------------|
+| gleanerconfig.yaml | s3:{bucket}/scheduler/configs/gleanerconfig.yaml | | ingest workflow: needs to be in minio/s3 |
+| tenant.yaml | s3:{bucket}/scheduler/configs/tenant.yaml | | ingest workflow: needs to be in minio/s3 |
+
+### updating config
+You can update a config, and a sensor should pick up the changes.
+1) upload the changed file to s3
+ - note: if this is a new source, you need to add it to the docker config (gleaner-PROJECT)
+2) go to overview, ![overview](images/overview_sensors_tab.png)
+3) go to s3_config_source_sensor for gleanerconfig.yaml changes, and s3_config_tenant_sensor for tenant.yaml changes
+ ![sensor](images/sources_sensor.png).
+4) at some point, a run should occur. ![run](images/runs.png).
+5) then go to the sources_sensor, or tenant sensor.
+If the job does not run, you can do a backfill.
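+
+A minimal sketch of pushing an updated config so the sensor can pick it up (assumes boto3; the endpoint, bucket, and credentials are illustrative -- use the GLEANERIO_MINIO_* values from your .env):
+
+```python
+import boto3
+
+# hypothetical endpoint/bucket; substitute your deployment's values
+s3 = boto3.client(
+    "s3",
+    endpoint_url="https://minio.example.org",
+    aws_access_key_id="ACCESS_KEY",
+    aws_secret_access_key="SECRET_KEY",
+)
+s3.upload_file(
+    "configs/eco/gleanerconfig.yaml",        # local edited copy
+    "gleaner",                               # bucket
+    "scheduler/configs/gleanerconfig.yaml",  # key the sensor watches
+)
+```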
+#### new sources
+1) go to the job tab, and run summon_and_release with the 'partitions' aka 'sources' that are recent
+2) click materialize_all, and in the backfill dialog be sure only the added partition is selected. ![backfill](images/materialize.png)
+3) go to runs, and see that a job with a partition with that name is queued/running
+4) run tenant_release_job with the same partition name to load data to tenants
+#### new tenants
+There are two jobs that need to run to move data to a tenant. (A third will be needed for UI.)
+1) go to the job tab, and run tenant_namespaces_job with the 'partitions' aka 'tenant' that are recent
+2) click materialize_all, and be sure only the added partition is selected
+3) go to runs, and see that a job with a partition with that name is queued/running
+4) go to the job tab, and run tenant_release_job with the 'partitions' aka 'sources' for that tenant
+5) click materialize_all. The data will be pushed to all tenant namespaces
+
+## test schedules
+
+![schedules tab](images/schedules_tab.png)
+![schedules example](images/schedules_example.png)
+![schedules select](images/schedules_select.png)
+![schedules test](images/schedules_test.png)
### Environment files
1) cp deployment/envFile.env .env
2) edit
3) `export $(cat .env | xargs)`
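
If you are launching workflows from an IDE or test runner instead of a shell, a sketch using python-dotenv (an assumption -- the package is not part of this repo) does the same thing:

```python
from dotenv import load_dotenv  # pip install python-dotenv

load_dotenv(".env")  # loads the GLEANERIO_* variables into os.environ
```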
-``` bash
+```yaml
######
# Nabu and Gleaner configs need to be in docker configs
## docker config name GLEANER_GLEANER_DOCKER_CONFIG
## docker config name GLEANER_NABU_DOCKER_CONFIG
# suggested DOCKER_CONFIG NAMING PATTERN (nabu||gleaner)-{PROJECT}
########
-GLEANERIO_GLEANER_DOCKER_CONFIG=gleaner-eco
-GLEANERIO_NABU_DOCKER_CONFIG=nabu-eco
+GLEANERIO_DOCKER_GLEANER_CONFIG=gleaner-eco
+GLEANERIO_DOCKER_NABU_CONFIG=nabu-eco
# ###
# workspace for dagster
####
GLEANERIO_WORKSPACE_CONFIG_PATH=/usr/src/app/workspace.yaml
-GLEANERIO_WORKSPACE_DOCKER_CONFIG=workspace-eco
+GLEANERIO_DOCKER_WORKSPACE_CONFIG=workspace-eco
+
+
+
+DEBUG_CONTAINER=false
+
+#### HOST
+# host base name for traefik. fixed to localhost:3000 when using compose_local.
+HOST=localhost
+# Applies only to compose_project.yaml runs
+# modify SCHED_HOSTNAME if you want to run more than one instance,
+# aka two different project harvests for now.
+SCHED_HOSTNAME=sched
-# NETWORK is needed for headless rendering
-# gleaner
+GLEANERIO_DOCKER_CONTAINER_WAIT_TIMEOUT=300
+# debugging set to 10 - 30 seconds
-DEBUG=False
PROJECT=eco
#PROJECT=iow
#PROJECT=oih
-HOST=localhost
+# tags for docker compose
+CONTAINER_CODE_TAG=latest
+CONTAINER_DAGSTER_TAG=latest
+
PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
# port is required: https://portainer.{HOST}:443/api/endpoints/2/docker/
-PORTAINER_URL=
-PORTAINER_KEY=
+GLEANERIO_DOCKER_URL=https://portainer.{HOST}:443/api/endpoints/2/docker/
+GLEANERIO_PORTAINER_APIKEY=
+# if running dagster-dev, then this needs to be set ,
+# defaults to "/scheduler/gleanerconfig.yaml" which is path to config mounted in containers
+# when debugging generated code "../../../configs/eco/gleanerconfig.yaml"
+# when debugging code in workflows "../../configs/eco/gleanerconfig.yaml"
+GLEANERIO_DAGSTER_CONFIG_PATH=../../../configs/eco/gleanerconfig.yaml
# Network
-GLEANERIO_HEADLESS_NETWORK=headless_gleanerio
+GLEANERIO_DOCKER_HEADLESS_NETWORK=headless_gleanerio
### GLEANER/NABU Dockers
-GLEANERIO_GLEANER_IMAGE=nsfearthcube/gleaner:latest
-GLEANERIO_NABU_IMAGE=nsfearthcube/nabu:latest
-
-
+GLEANERIO_GLEANER_IMAGE=nsfearthcube/gleaner:dev_ec
+GLEANERIO_NABU_IMAGE=nsfearthcube/nabu:dev_eco
##
# path where configs are deployed/mounted
@@ -158,12 +386,8 @@ GLEANERIO_NABU_IMAGE=nsfearthcube/nabu:latest
GLEANERIO_GLEANER_CONFIG_PATH=/gleaner/gleanerconfig.yaml
GLEANERIO_NABU_CONFIG_PATH=/nabu/nabuconfig.yaml
###
-
-
-
-
-
-#GLEANERIO_LOG_PREFIX=scheduler/logs/
+#path in s3 for docker log files
+GLEANERIO_LOG_PREFIX=scheduler/logs/
GLEANERIO_MINIO_ADDRESS=
GLEANERIO_MINIO_PORT=80
@@ -173,100 +397,32 @@ GLEANERIO_MINIO_ACCESS_KEY=
GLEANERIO_MINIO_SECRET_KEY=
GLEANERIO_HEADLESS_ENDPOINT=http://headless:9222
-
-
# just the base address, no namespace https://graph.geocodes-aws-dev.earthcube.org/blazegraph
-GLEANERIO_GRAPH_URL=
-GLEANERIO_GRAPH_NAMESPACE=
-
-
-```
-
-# Implementation Networks
-
-This ([https://github.com/sharmasagar25/dagster-docker-example](https://github.com/sharmasagar25/dagster-docker-example))
-is an example on how to structure a [Dagster] project in order to organize
-the jobs, repositories, schedules, and ops. The example also contains
-examples on unit-tests and a docker-compose deployment file that utilizes a
-Postgresql database for the run, event_log and schedule storage.
-
-This example should in no way be considered suitable for production and is
-merely my own example of a possible file structure. I personally felt that it
-was difficult to put the Dagster concepts to use since the projects own examples
-had widely different structure and was difficult to overview as a beginner.
-
-The example is based on the official [tutorial].
-
-## Folders
+GLEANERIO_GRAPH_URL=https://graph.geocodes-aws.earthcube.org/blazegraph
+GLEANERIO_GRAPH_NAMESPACE=mytest
-* build: build directives for the docker containers
-* configs
-* src
-* tooling
+# optional: GLEANERIO_GRAPH_SUMMARY_ENDPOINT defaults to GLEANERIO_GRAPH_URL
+#GLEANERIO_GRAPH_SUMMARY_ENDPOINT=https://graph.geocodes-aws-dev.earthcube.org/blazegraph
+GLEANERIO_GRAPH_SUMMARY_NAMESPACE=mytest_summary
+GLEANERIO_GRAPH_SUMMARIZE=True
-## Requirements
+# where are the gleaner and tenant configurations
+GLEANERIO_CONFIG_PATH="scheduler/configs/"
+GLEANERIO_TENANT_FILENAME="tenant.yaml"
+GLEANERIO_SOURCES_FILENAME="gleanerconfig.yaml"
-At this point it is expected that you have a valid Gleaner config file named
-_gleanerconfig.yaml_ located in some path within the _configs_ directory.
+# ECO Custom variables for ecrr
+ECRR_GRAPH_NAMESPACE=ecrr
+ECRR_MINIO_BUCKET=ecrr
-## Building the dagster code from templates
+# only a public slack channel works. DV has no permissions to create a new channel
+#SLACK_CHANNEL="#production_discussion"
+SLACK_CHANNEL="#twitterfeed"
+SLACK_TOKEN=
-The python program pygen will read a gleaner configuration file and a set of
-template and build the Dagster code from there.
-
-```bash
-python pygen.py -cf ./configs/nsdf/gleanerconfig.yaml -od ./src/implnet-nsdf/output -td ./src/implnet-nsdf/templates -d 7
-```
-
-
-## Running
-
-There is an example on how to run a single pipeline in `src/main.py`. First
-install the dependencies in an isolated Python environment.
-
-```bash
-pip install -r requirements
```
-The code built above can be run locally, though your templates may be set up
-to reference services and other resources not present on your dev machine. For
-complex examples like these, it can be problematic.
-
-If you are looking for some simple examples of Dagster, check out the directory
-examples for some smaller self-contained workflows. There are good for testing
-things like sensors and other approaches.
-
-If you wish to still try the generated code cd into the output directory
-you specified in the pygen command.
-Then use:
-
-```bash
-dagit -h ghost.lan -w workspace.yaml
-```
-
-## Building
-
-```bash
- podman build -t docker.io/fils/dagster:0.0.24 .
-```
-
-```bash
- podman push docker.io/fils/dagster:0.0.24
-```
-
-
-
-# Appendix
-
-## Setup
-
-
-![orchestration](images/orchestrationInit.svg)
-
-## Docker API sequence
-
-![sequence](../docs/images/sequence.svg)
## Appendix
@@ -278,27 +434,47 @@ at the documentation for [Accessing the Portainer API](https://docs.portainer.io
## Notes
-Single file testing run
-```bash
- dagit -h ghost.lan -f test1.py
-```
+### Handle Multiple Organizations
-* Don't forget to set the DAGSTER_HOME dir like in
+thoughts...
-```bash
- export DAGSTER_HOME=/home/fils/src/Projects/gleaner.io/scheduler/python/dagster
-```
+* Each organization can be in a container with its own code workflow.
+ * in the workflows directory: `dagster project scaffold --name projectname`
+* If we can standardize the loading and transforming workflows as much as possible, then the graph loading workflows
+ should be [standardized](https://github.com/earthcube/scheduler/issues/142). We could just define an additional container in a compose file, and add that to the workflows
```
-dagster-daemon run
-```
-
-Run from directory where workspace.yaml is.
-```
-dagit --host 192.168.202.159
+load_from:
+# - python_file:
+# relative_path: "project/eco/repositories/repository.py"
+# location_name: project
+# working_directory: "./project/eco/"
+# - python_file:
+# relative_path: "workflows/ecrr/repositories/repository.py"
+# working_directory: "./workflows/ecrr/"
+ # module starting out with the definitions api
+ # - python_module: "workflows.tasks.tasks"
+
+ - grpc_server:
+ host: dagster-code-tasks
+ port: 4000
+ location_name: "tasks"
+ - grpc_server:
+ host: dagster-code-eco-ingest
+ port: 4000
+ location_name: "ingest"
+ - grpc_server:
+ host: dagster-code-ios-ingest
+ port: 4000
+ location_name: "ingest"
+ - grpc_server:
+ host: dagster-code-eco-ecrr
+ port: 4000
+ location_name: "ecrr"
```
+* to add a container, you need to edit the workflows.yaml in an organizations configuration
## Cron Notes
@@ -336,7 +512,4 @@ We can then use the docker approach
to run indexes on specific sources in these configuration files.
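
A hedged sketch of what such a per-source cron schedule looks like in Dagster (the job name is illustrative, not the generated code):

```python
from dagster import RunRequest, schedule

@schedule(cron_schedule="0 12 * * 6",   # noon Saturday
          job=summon_and_release_job,   # assumes such a job is defined
          execution_timezone="US/Central")
def source_weekly_schedule(context):
    return RunRequest(run_key=None, run_config={})
```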
-## References
-
-* [Simple Dagster example](https://bakerwho.github.io/posts/datascience/Deployable-Dagster-MVP/)
diff --git a/docs/README_LOCAL_DEVELOPMENT.md b/docs/README_LOCAL_DEVELOPMENT.md
index eb0b3c80..730ad7d4 100644
--- a/docs/README_LOCAL_DEVELOPMENT.md
+++ b/docs/README_LOCAL_DEVELOPMENT.md
@@ -52,17 +52,7 @@ You need to set the environment based on dagster/implnets/deployment/envFile.env
will run just the task, and in editable form, i think.
-### testing generated code
-`cd generatedCode/PROJECT/output/`
-
-`export $(sed '/^[ \t]*#/d' ../../../deployment/.env | sed '/^$/d' | xargs)`
-
-`dagster dev`
-
-??? note
- YOU CANNOT SET BREAKPOINTS IN TEMPLATES
- YOU NEED TO cd generatedCode/PROJECT/output/jobs and set them in the job you are testing.
## TESTING CONTAINERS
diff --git a/docs/add_containers.md b/docs/add_containers.md
deleted file mode 100644
index 64e9f1e7..00000000
--- a/docs/add_containers.md
+++ /dev/null
@@ -1,22 +0,0 @@
-# Containers for Dagster Scheduler
-
-## Things to work on
-* secrets
-* modify compose to use a project variable
-* network modify to use passed evn for network names
-
-Future:
-* can we use a volume. Use git to pull?
-
-## Build Docker
-Needs to be automated with a workflow
-
-something about archive file
-
-## add Stack.
- dagster/deployment/compose.yaml make usre that it is the correct version.
- image: docker.io/fils/dagster_eco:0.0.44
-
-NOTE: we can use a env ${project} varibale like we do for geocodes_.._named.yaml
-
-
diff --git a/docs/developement.md b/docs/developement_uisng_generated_code.md
similarity index 88%
rename from docs/developement.md
rename to docs/developement_uisng_generated_code.md
index a29e5d1d..8823b858 100644
--- a/docs/developement.md
+++ b/docs/developement_uisng_generated_code.md
@@ -1,5 +1,9 @@
# Scheduler Developement in Dagster
+ **NOTE: originally, a set of workflows was generated for each source. These were compiled into separate 'project' containers by a github workflow.
+This is no longer needed, but these were the original instructions.**
+
+
!!! Note
add [envfile](https://plugins.jetbrains.com/plugin/7861-envfile) plug in to PyCharm to allow for easy debugging to code
@@ -102,12 +106,12 @@ GLEANERIO_WORKSPACE_CONFIG_PATH=/usr/src/app/workspace.yaml
GLEANERIO_WORKSPACE_DOCKER_CONFIG=workspace-eco
-# NETWORK is needed for headless rendering
-# gleaner
-
-
DEBUG=False
+GLEANERIO_CONTAINER_WAIT_SECONDS=300
+# debugging: set to 5 or 10 seconds
PROJECT=eco
+CONTAINER_CODE_TAG=latest
+CONTAINER_DAGSTER_TAG=latest
#PROJECT=iow
#PROJECT=oih
HOST=localhost
@@ -115,7 +119,13 @@ PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
# port is required: https://portainer.{HOST}:443/api/endpoints/2/docker/
PORTAINER_URL=
PORTAINER_KEY=
-
+# if running dagster-dev, then this needs to be set ,
+# defaults to "/scheduler/gleanerconfig.yaml" which is path to config mounted in containers
+# when debugging generated code "../../../configs/eco/gleanerconfig.yaml"
+# when debugging code in workflows "../../configs/eco/gleanerconfig.yaml"
+# DAGSTER_GLEANER_CONFIG_PATH=../../../configs/eco/gleanerconfig.yaml
+GLEANERIO_CONTAINER_WAIT_SECONDS=3600
+#GLEANERIO_CONTAINER_WAIT_SECONDS=30
# Network
GLEANERIO_HEADLESS_NETWORK=headless_gleanerio
@@ -123,20 +133,14 @@ GLEANERIO_HEADLESS_NETWORK=headless_gleanerio
GLEANERIO_GLEANER_IMAGE=nsfearthcube/gleaner:latest
GLEANERIO_NABU_IMAGE=nsfearthcube/nabu:latest
-
-
##
# path where configs are deployed/mounted
####
GLEANERIO_GLEANER_CONFIG_PATH=/gleaner/gleanerconfig.yaml
GLEANERIO_NABU_CONFIG_PATH=/nabu/nabuconfig.yaml
###
-
-
-
-
-
-#GLEANERIO_LOG_PREFIX=scheduler/logs/
+#path in s3 for docker log files
+GLEANERIO_LOG_PREFIX=scheduler/logs/
GLEANERIO_MINIO_ADDRESS=
GLEANERIO_MINIO_PORT=80
@@ -146,12 +150,14 @@ GLEANERIO_MINIO_ACCESS_KEY=
GLEANERIO_MINIO_SECRET_KEY=
GLEANERIO_HEADLESS_ENDPOINT=http://headless:9222
-
-
# just the base address, no namespace https://graph.geocodes-aws-dev.earthcube.org/blazegraph
GLEANERIO_GRAPH_URL=
GLEANERIO_GRAPH_NAMESPACE=
+# example: https://graph.geocodes.ncsa.illinois.edu/blazegraph/namespace/yyearthcube2/sparql
+#graph endpoint will be GLEANERIO_GRAPH_URL
+GLEANERIO_SUMMARY_GRAPH_NAMESPACE=
+GLEANERIO_SUMMARIZE_GRAPH=True
```
diff --git a/docs/eco_deploy.md b/docs/eco_deploy.md
new file mode 100644
index 00000000..888e15f3
--- /dev/null
+++ b/docs/eco_deploy.md
@@ -0,0 +1,77 @@
+# ECO Scheduler Notes
+
+!!! Note
+ these will need to become the gleanerio scheduler documentation.
+ for now these are rough. Images and graphics need to be loaded
+
+```mermaid
+flowchart TB
+Postgres_Container-- defined by --> compose_project
+Dagit_UI_Container-- defined by --> compose_project
+Dagster_Container -- defined by --> compose_project
+Headless_Container -- defined by --> compose_project
+configs_volume_Container -- defined by --> compose_project
+compose_project -- deployed to --> docker_portainer
+
+Gleaner_container -- image manual add --> docker_portainer
+Nabu_container -- image manual add --> docker_portainer
+
+Gleaner_container -- deployed by --> Dagster_Container
+Nabu_container -- deployed by --> Dagster_Container
+
+Gleaner_container-- deployed to --> docker_portainer
+Nabu_container-- deployed to --> docker_portainer
+
+Dagit_UI_Container -- Created by --> Github_action
+Dagster_Container -- Created by --> Github_action
+
+NabuConfig.tgz -- Archive to --> Nabu_container
+GleanerConfig.tgz -- Archive to --> Gleaner_container
+
+NabuConfig.tgz -- Stored in s3 --> s3
+GleanerConfig.tgz -- Stored in s3 --> s3
+
+configs_volume_Container -- populates volume --> dagster-project
+dagster-project -- has --> gleanerConfig.yaml
+dagster-project -- has --> nabuConfig.yaml
+```
+
+## Deploy
+
+### Deploy Dagster in Portainer
+You will need to deploy the dagster containers to portainer, for a docker swarm.
+Prerequisites:
+- get the portainer url and auth token
+- SSH to the machine hosting the docker
+
+1. pull the scheduler repo
+2. cd dagster/implnets/deployment
+3. create a copy of envFile.env and **edit env variables**
+ - PROJECT=eco
+ - GLEANERIO_MINIO_ADDRESS ++
+ - GLEANERIO_GRAPH_URL, GLEANERIO_GRAPH_NAMESPACE
+ - GLEANERIO_DOCKER_URL, GLEANERIO_PORTAINER_APIKEY
+ - SCHED_HOSTNAME defaults to sched
+4. as noted in (Compose, Environment and Docker API Assets), deploy the configuration to s3
+5. ~~create network and volumes needed `dagster_setup_docker.sh`~~
+6. manually add configs
+ - gleaner-{project}
+ - nabu-{project}
+ - workspace-{project}
+ - tenant-{project}
+ - dagster from: dagster/implnets/deployment/dagster.yaml
+7. add configs to S3/Minio
+ - scheduler/configs/gleanerconfig.yml
+ - scheduler/configs/tenant.yml
+8. create a stack
+ - github repo: https://github.com/earthcube/scheduler.git
+ - branch: dev
+ - compose files: dagster/implnets/deployment/compose_project.yaml
+ - additional path: dagster/implnets/deployment/compose_project_eco_override.yaml
diff --git a/docs/images/materialize.png b/docs/images/materialize.png
new file mode 100644
index 00000000..a22a3f5f
Binary files /dev/null and b/docs/images/materialize.png differ
diff --git a/docs/images/overview_sensors_tab.png b/docs/images/overview_sensors_tab.png
new file mode 100644
index 00000000..d6717a0c
Binary files /dev/null and b/docs/images/overview_sensors_tab.png differ
diff --git a/docs/images/pycharm_runconfig.png b/docs/images/pycharm_runconfig.png
new file mode 100644
index 00000000..8a20630a
Binary files /dev/null and b/docs/images/pycharm_runconfig.png differ
diff --git a/docs/images/runs.png b/docs/images/runs.png
new file mode 100644
index 00000000..570d703b
Binary files /dev/null and b/docs/images/runs.png differ
diff --git a/docs/images/schedules.png b/docs/images/schedules.png
new file mode 100644
index 00000000..0fd65ff0
Binary files /dev/null and b/docs/images/schedules.png differ
diff --git a/docs/images/schedules_example.png b/docs/images/schedules_example.png
new file mode 100644
index 00000000..d51638ac
Binary files /dev/null and b/docs/images/schedules_example.png differ
diff --git a/docs/images/schedules_select.png b/docs/images/schedules_select.png
new file mode 100644
index 00000000..62b4d382
Binary files /dev/null and b/docs/images/schedules_select.png differ
diff --git a/docs/images/schedules_tab.png b/docs/images/schedules_tab.png
new file mode 100644
index 00000000..2366a838
Binary files /dev/null and b/docs/images/schedules_tab.png differ
diff --git a/docs/images/schedules_test.png b/docs/images/schedules_test.png
new file mode 100644
index 00000000..42af1f26
Binary files /dev/null and b/docs/images/schedules_test.png differ
diff --git a/docs/images/sources_sensor.png b/docs/images/sources_sensor.png
new file mode 100644
index 00000000..43361cf4
Binary files /dev/null and b/docs/images/sources_sensor.png differ
diff --git a/docs/ingest_workflow.md b/docs/ingest_workflow.md
new file mode 100644
index 00000000..6cb5438c
--- /dev/null
+++ b/docs/ingest_workflow.md
@@ -0,0 +1,78 @@
+# ingest workflow
+
+This is found in implnets/workflows/ingest
+
+
+```mermaid
+flowchart LR
+ subgraph dagster
+ subgraph sensors
+ s3_config_sources_sensor['sources_all_active']
+ s3_config_tenant_sensor['tenant with sources']
+ sources_sensor
+ release_file_sensor
+ tenant_names_sensor
+ tenant_namespaces_job
+ end
+ subgraph jobs
+ summon_and_release
+ sources_config_updated
+ tenant_release
+ tenant_config_updated
+ end
+ subgraph assets
+ source_names_active
+ sources_all
+ tenant_names
+ tenant_all
+ end
+
+
+ end
+ s3_config_sources_sensor--monitors --> sources_config
+ s3_config_tenant_sensor--monitors -->tenant_config
+ s3_config_sources_sensor--starts-->sources_config_updated
+ sources_config_updated--materializes-->source_names_active
+ sources_config_updated--materializes-->sources_all
+ s3_config_tenant_sensor--starts-->tenant_config_updated
+ tenant_config_updated--creates-->tenant_names
+ tenant_config_updated--creates-->tenant_all
+ sources_sensor--monitors-->sources_all
+ sources_sensor--starts-->summon_and_release
+ summon_and_release--starts--> gleanerio
+ gleanerio-->summon
+ gleanerio-->graph_path
+ tenant_names-->tenant_names_sensor
+ tenant_names_sensor--starts-->tenant_namespaces_job
+ tenant_namespaces_job--creates--> tenant_namespace
+ tenant_namespaces_job--creates-->tenant_summary_namespace
+ release_file_sensor--monitors-->graph_path
+ release_file_sensor--loads-->tenant_namespace
+ release_file_sensor--loads-->tenant_summary_namespace
+
+ subgraph portainer
+ gleanerio
+ tenant_ui
+ subgraph services
+ subgraph triplestore
+ tenant_namespace
+ tenant_summary_namespace
+ end
+
+ subgraph minio_s3
+ subgraph bucket_paths
+ subgraph scheduler
+ sources_config["`scheduler/configs/gleanerconfig.yaml`"]
+ tenant_config["`scheduler/configs/tenant.yaml`"]
+ logs
+ end
+ summon
+ graph_path['graph']
+ end
+ end
+
+ end
+ end
+
+
+```
diff --git a/docs/quick.md b/docs/quick.md
index 5091a718..7da525de 100644
--- a/docs/quick.md
+++ b/docs/quick.md
@@ -1,56 +1,37 @@
# Notes
-## Implementation network builder
+### Run Deploy Dagster locally (ROUGH)
+Dagster needs a docker instance to run Gleanerio. We usually do this in a remote container.
+Basically, you can run a single workflow with the UI from that workflow's directory with `dagster dev`.
+
+You will need to deploy the dagster containers to portainer, for a docker swarm.
+Prerequisites:
+- get the portainer url and auth token
+- SSH to the machine hosting the docker
+
+1. pull the scheduler repo
+2. cd dagster/implnets/deployment
+3. create a copy of envFile.env and **edit env variables**
+ - PROJECT=eco
+ - GLEANERIO_MINIO_ADDRESS ++
+ - GLEANERIO_GRAPH_URL, GLEANERIO_GRAPH_NAMESPACE
+ - GLEANERIO_DOCKER_URL, GLEANERIO_PORTAINER_APIKEY
+ - SCHED_HOSTNAME defaults to sched
+4. as noted in (Compose, Environment and Docker API Assets), deploy the configuration to s3
+5. ~~create network and volumes needed `dagster_setup_docker.sh`~~
+6. manually add configs
+ - gleaner-{project}
+ - nabu-{project}
+ - workspace-{project}
+ - tenant-{project}
+ - dagster from: dagster/implnets/deployment/dagster.yaml
+7. add configs to S3/Minio
+ - scheduler/configs/gleanerconfig.yml
+ - scheduler/configs/tenant.yml
+8. then you can run a command; in runConfigs there are PyCharm run files
+ - set ENV
+ - GLEANERIO_GLEANER_CONFIG_PATH=/Users/valentin/development/dev_earthcube/scheduler/dagster/implnets/configs/eco/gleanerconfig.yaml
+ - `cd dagster/implnets/workflows/ingest`
+ - `dagster dev`
+
+**NEED MORE EXAMPLES**
-The work for building the dagster containers for a given implementation network starts in
-the directory ```scheduler/dagster/implnets```. At this time most of this can be driven by
-the Makefile.
-
-1) Make sure your gleanerconfig.yaml file is in the configs/NETWORK directory where
- NETWORK is your implmentation network like eco, iow, etc.
-2) Check the VERSION file and make sure it has a value you want in it to be tagged to the containers.
-3) ```make eco-clean``` will remove any existing generated code from the ./generatedCode/implnet-NETWORK directory
-4) ```make eco-generate``` will build the code new. Set the -d N in the makefile to a value N that is the number
- of days you want the runs to cycle over. So 30 would mean they run once every 30 days. If you want some providers
- to index at different rates you currently need to go in and edit the associated provider _schedules_ file editing the
- line ```@schedule(cron_schedule="0 12 * * 6", job=implnet_job_amgeo, execution_timezone="US/Central")``` with a
- cron value you want.
-5) ```make eco-build``` builds the Docker images following the build file ./build/Docker file. Note this uses the
- command line argument ```--build-arg implnet=eco``` to set the implementation NETWORK so that the correct build code
- from _generatedCode/NETWORK_ is copied over
-6) ```make eco-push``` push to your container registry of choice, here docker.io
-
-
-
-## Compose, Environment and Docker API Assets
-
-1) You will need the (or need to make) the portainer access token
- from your https://portainer.geocodes-aws-dev.earthcube.org/#!/account
-2) You will need a valid Gleaner configuration file named gleanerconfig.yaml and a nabu config named nabuconfig.yaml
-3) You will need the schema.org context files places in a directory _assets_ get each of the http and https versions
- 1) ```wget https://schema.org/version/latest/schemaorg-current-https.jsonld```
- 2) ```wget https://schema.org/version/latest/schemaorg-current-http.jsonld```
-4) Generate the archive files for Gleaner and Nabu. Note the path to the context
- files should map with what is in the configuration files
- 1) ```tar -zcf ./archives/NabuCfg.tgz ./nabuconfig.yaml ./assets```
- 2) ```tar -zcf ./archives/GleanerCfg.tgz ./gleanerconfig.yaml ./assets```
-5) The archives .tgz files named _NabuCfg.tgz_ and _GleanerCfg.tgz_ need to be copied to the schedule prefix
- in your bucket used for Gleaner
- 1) ```mc cp GleanerCfg.tgz NabuCfg.tgz gleaner/scheduler/configs```
- 2) Make sure GLEANERIO_NABU_ARCHIVE_OBJECT and GLEANERIO_GLEANER_ARCHIVE_OBJECT reflect this location in the .env file
-6) Next you will need to build the scheduler containers for your implementation network. Push these containers
- to your container registry of choice and make sure the values are set in the .env file and that
- the containers are available to Portainer or will get pulled on use. These are the image files in the
- compose file and also the images notes in the environment variables GLEANERIO_GLEANER_IMAGE and GLEANERIO_NABU_IMAGE
- in the .env file.
-
-
-At this point you are ready to move to your Docker or Portainer environment and deploy the
-compose and environment files.
-
-## Notes
-
-1) I do not have the API call to ensure/check/pull and image used by the API, so these images need to be
- pulled down manually at this time. These are the images noted by the .env files at
- ```GLEANERIO_GLEANER_IMAGE=fils/gleaner:v3.0.11-development-df``` and
- ```GLEANERIO_NABU_IMAGE=fils/nabu:2.0.8-development```
diff --git a/docs/refactor.md b/docs/refactor.md
new file mode 100644
index 00000000..dc9dd359
--- /dev/null
+++ b/docs/refactor.md
@@ -0,0 +1,33 @@
+# Refactoring
+
+## Rework code to generate less
+* ~~just generate the graph, and some configuration loading~~ done
+* ~~pass the ops the configuration~~
+
+## Can we use the s3 manager to store some assets?
+* ~~reports seem like the ideal use case for these.~~ works
+
+## Handle multiple workflows
+* ~~need to add the ability to deploy some other workflows~~ works
+
+
+### Handle Multiple Organizations
+* Each organization can be in a container with its own code workflow.
+* If we can standardize the loading and transforming workflows as much as possible, then the graph-loading workflows
+ should be easier to customize.
+* To add a container, edit the workflows.yaml in an organization's configuration.
+
+### Possible workflows
+* ~~timeseries after final graph~~ done
+ * ~~generate a CSV of the load-report sizes (sitemap, summoned, summon failures, milled, loaded to graph, datasets)~~
+
+* ~~weekly summary~~ done
+ * ~~what is the size of the graph this week?~~
+* post-s3 check, as an approval check.
+ * which objects do not contain JSON-LD?
+ * store as an asset, or maybe keep a file we publish as 'approved/expected non-summoned'
+* sitemap check (see the sketch below)
+ * just run a HEAD against each sitemap URL to check that it works and exists, weekly.
+ * publish as partitioned data in s3 ;)
+* SHACL... should we run SHACL on releases?
+ * if so, then maybe teach dagster to watch graph/latest for changes.
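+
+For the sitemap check above, a minimal sketch, assuming the sitemap URLs live under a top-level `sources` list (with `url` keys) in gleanerconfig.yaml; `check_sitemaps` is a hypothetical name:
+
+```python
+import requests
+import yaml
+
+def check_sitemaps(gleaner_config_path: str) -> dict:
+    """HEAD each source sitemap URL; map url -> status code (or error text)."""
+    with open(gleaner_config_path) as f:
+        cfg = yaml.safe_load(f)
+    results = {}
+    for source in cfg.get("sources", []):
+        url = source.get("url")
+        if not url:
+            continue
+        try:
+            resp = requests.head(url, timeout=30, allow_redirects=True)
+            results[url] = resp.status_code
+        except requests.RequestException as err:
+            results[url] = str(err)
+    return results
+
+if __name__ == "__main__":
+    # e.g. the path used in docs/quick.md
+    print(check_sitemaps("configs/eco/gleanerconfig.yaml"))
+```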
diff --git a/mkdocs.yml b/mkdocs.yml
index b41edfc5..a5b5a815 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -9,7 +9,7 @@ theme:
- navigation.sections
markdown_extensions:
- toc:
- permalink:
+ permalink:
- attr_list
- def_list
- tables
@@ -26,9 +26,9 @@ plugins:
- multirepo:
# (optional) tells multirepo to cleanup the temporary directory after site is built.
cleanup: true
- - mermaid2
+ - mermaid2
- literate-nav
- - mkdocs-jupyter
+ - mkdocs-jupyter
- callouts
# get a NoneType error, even when trying to generate in Geocodes-Metadata
# - schema_reader:
@@ -36,7 +36,15 @@ plugins:
# - "./docs/GeoCODES-Metadata/schemas/"
nav:
- Gleaner IO Scheduler:
- - Dagster: README.md
+ - Scheduler: README.md
+ - Ingest Workflow: ingest_workflow.md
- Quick: quick.md
- Add Containers: add_containers.md
+ - Scheduler Deployment: eco_deploy.md
+
+ - Development:
+ - Local Development: README_LOCAL_DEVELOPMENT.md
+ - Developing Schedules: developement.md
+ - Troubleshooting Workflows: monitoring_workflows.md
+
diff --git a/runConfigurations/compose_local.yaml_ Compose Deployment.run.xml b/runConfigurations/compose_local.yaml_ Compose Deployment.run.xml
new file mode 100644
index 00000000..e5764469
--- /dev/null
+++ b/runConfigurations/compose_local.yaml_ Compose Deployment.run.xml
@@ -0,0 +1,11 @@
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/runConfigurations/dagster_eco_debug .run.xml b/runConfigurations/dagster_eco_debug .run.xml
index 324befdc..13969f7a 100644
--- a/runConfigurations/dagster_eco_debug .run.xml
+++ b/runConfigurations/dagster_eco_debug .run.xml
@@ -1,13 +1,15 @@
+
+
-
+
@@ -16,7 +18,7 @@
-
+
@@ -33,4 +35,4 @@
-
\ No newline at end of file
+
diff --git a/runConfigurations/dagster_eco_implnet_job_geocodes_demo_datasets.run.xml b/runConfigurations/dagster_eco_implnet_job_geocodes_demo_datasets.run.xml
new file mode 100644
index 00000000..9383d2e6
--- /dev/null
+++ b/runConfigurations/dagster_eco_implnet_job_geocodes_demo_datasets.run.xml
@@ -0,0 +1,37 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/runConfigurations/dagster_ecrr__job examples.run.xml b/runConfigurations/dagster_ecrr__job examples.run.xml
new file mode 100644
index 00000000..d5b79bf8
--- /dev/null
+++ b/runConfigurations/dagster_ecrr__job examples.run.xml
@@ -0,0 +1,40 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/runConfigurations/dagster_ecrr_debug .run.xml b/runConfigurations/dagster_ecrr_debug .run.xml
new file mode 100644
index 00000000..00996fe8
--- /dev/null
+++ b/runConfigurations/dagster_ecrr_debug .run.xml
@@ -0,0 +1,40 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/runConfigurations/dagster_ecrr_run list.run.xml b/runConfigurations/dagster_ecrr_run list.run.xml
new file mode 100644
index 00000000..335ca80c
--- /dev/null
+++ b/runConfigurations/dagster_ecrr_run list.run.xml
@@ -0,0 +1,40 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/runConfigurations/dagster_ingest_debug (1).run.xml b/runConfigurations/dagster_ingest_debug (1).run.xml
new file mode 100644
index 00000000..f8db179a
--- /dev/null
+++ b/runConfigurations/dagster_ingest_debug (1).run.xml
@@ -0,0 +1,38 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/runConfigurations/dagster_ingest_materialize.run.xml b/runConfigurations/dagster_ingest_materialize.run.xml
new file mode 100644
index 00000000..82f90cdf
--- /dev/null
+++ b/runConfigurations/dagster_ingest_materialize.run.xml
@@ -0,0 +1,38 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/runConfigurations/dagster_tasks_debug .run.xml b/runConfigurations/dagster_tasks_debug .run.xml
new file mode 100644
index 00000000..627f0293
--- /dev/null
+++ b/runConfigurations/dagster_tasks_debug .run.xml
@@ -0,0 +1,37 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/runConfigurations/dagster_tasks_materialize loadstatsCommunity (1).run.xml b/runConfigurations/dagster_tasks_materialize loadstatsCommunity (1).run.xml
new file mode 100644
index 00000000..3edc9ff7
--- /dev/null
+++ b/runConfigurations/dagster_tasks_materialize loadstatsCommunity (1).run.xml
@@ -0,0 +1,37 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/runConfigurations/dagster_tasks_materialize loadstatsCommunity.run.xml b/runConfigurations/dagster_tasks_materialize loadstatsCommunity.run.xml
new file mode 100644
index 00000000..ccac142f
--- /dev/null
+++ b/runConfigurations/dagster_tasks_materialize loadstatsCommunity.run.xml
@@ -0,0 +1,37 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/runConfigurations/dagster_tasks_materialize loadstatsHistory.run.xml b/runConfigurations/dagster_tasks_materialize loadstatsHistory.run.xml
new file mode 100644
index 00000000..6f8cf336
--- /dev/null
+++ b/runConfigurations/dagster_tasks_materialize loadstatsHistory.run.xml
@@ -0,0 +1,37 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/runConfigurations/dagster_tasks_materialize sos_types.run.xml b/runConfigurations/dagster_tasks_materialize sos_types.run.xml
new file mode 100644
index 00000000..984baf8e
--- /dev/null
+++ b/runConfigurations/dagster_tasks_materialize sos_types.run.xml
@@ -0,0 +1,37 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/runConfigurations/dagster_tasks_materialize tennants.run.xml b/runConfigurations/dagster_tasks_materialize tennants.run.xml
new file mode 100644
index 00000000..2c3cf813
--- /dev/null
+++ b/runConfigurations/dagster_tasks_materialize tennants.run.xml
@@ -0,0 +1,37 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/runConfigurations/pygen eco v1.run.xml b/runConfigurations/pygen eco v1.run.xml
index 2b180505..528a1abd 100644
--- a/runConfigurations/pygen eco v1.run.xml
+++ b/runConfigurations/pygen eco v1.run.xml
@@ -7,8 +7,9 @@
+
-
+
diff --git a/workspace.yaml b/workspace.yaml
new file mode 100644
index 00000000..2f012e73
--- /dev/null
+++ b/workspace.yaml
@@ -0,0 +1,7 @@
+load_from:
+ - python_file:
+ relative_path: "project/repositories/repository.py"
+ working_directory: .
+ - python_file:
+ relative_path: "gleanerio/repositories/repository.py"
+ working_directory: .
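+
+# Assumed usage: from the repo root, `dagster dev -w workspace.yaml`
+# loads both code locations above into the local Dagster UI.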